(I) Background

Instructor: Peng Wang, AVP, Head of Data Science - Operation & Fraud Detection at MassMutual Financial Group
Goal:
- Detect some outliers in a time series.
- Write a general outlier detection algorithm.
The data we used in this project is oil.price, from R library TSA.
- Monthly spot price for crude oil, Cushing, OK (in U.S. dollars per barrel), 01/1986 - 01/2006.
- Source: tonto.eia.doe.gov/dnav/pet/hist/rwtcM.htm

(II) Data Analysis

1. Visualize the time series

data("oil.price")
oil.price %>%
  plot(ylab = "oil price ($)", main = "Oil Price 01/1986 - 01/2006")

2. Decompose the time series

oil.price %>%
  decompose() %>%
  plot()

3. Extract the general trend and periodical trend

# remove linear trend
diff_1 = oil.price %>% diff()
diff_1 %>%
  plot(ylab = "oil price ($)",
       main = "1st difference")

# remove seasonal trend
diff_2 = diff_1 %>% diff(lag = 12)
diff_2 %>%
  plot(ylab = "oil price ($)",
       main = "2nd difference")

4. Fit a time series model to the remaining residuals

# model 1
diff_1 %>%
  armasubsets(6, 6) %>%
  plot()

model_1 = arima(oil.price,
                order = c(5, 1, 1),
                fixed = c(0, 0, 0, NA, NA, NA),
                transform.pars = FALSE)
paste0("AIC: ", model_1 %>% AIC() %>% round(2),
       "\nBIC: ", model_1 %>% BIC() %>% round(2)) %>%
  cat()

AIC: 1027.16
BIC: 1041.09

# model 2
diff_2 %>%
  armasubsets(6, 6) %>%
  plot()

model_2 = arima(oil.price,
                order = c(4, 2, 0),
                fixed = c(NA, NA, 0, NA),
                transform.pars = FALSE)
paste0("AIC: ", model_2 %>% AIC() %>% round(2),
       "\nBIC: ", model_2 %>% BIC() %>% round(2)) %>%
  cat()

AIC: 1092.39
BIC: 1106.3

# compare mean of two series
paste0("mean of diff_1: ", diff_1 %>% mean() %>% round(4),
       "\nmean of diff_2: ", diff_2 %>% mean() %>% round(4)) %>%
  cat()

mean of diff_1: 0.1773
mean of diff_2: 0.1005

From the AIC and BIC results, model_1 is better than model_2, although the second differencing with lag = 12 has a smaller mean value.

5. Use the model to determine if a data point is an outlier

# calculate fitted data (simulation)
fitted_data = model_1 %>% fitted()

# calculate outliers
upper = fitted_data + 2.5 * sqrt(model_1 %>% use_series(sigma2))
lower = fitted_data - 2.5 * sqrt(model_1 %>% use_series(sigma2))
outliers = oil.price %>% is_less_than(lower) %>% or(oil.price %>% is_greater_than(upper))
paste0("Outliers at: ",
       outliers %>%
         equals(TRUE) %>%
         which() %>%
         paste(collapse = ", ")) %>%
  cat()

Outliers at: 2, 56, 180, 226, 227, 234

# plot fitted data, original data, and outliers
oil.price %>%
  plot(main = "Oil Price Simulation with Outlier Points",
       ylab = "Oil Price")
fitted_data %>%
  lines(col = 2,
        lty = 2)
legend("topleft",
       legend = c("Original", "Simulation"),
       lty = 1:2,
       col = 1:2)
points(x = oil.price %>% time() %>% extract(outliers),
       y = oil.price %>% extract(outliers),
       pch = 19)

6. Plot outliers in the following time series

oil.price %>%
  plot(main = "Oil Price Simulation with Outlier Points",
       ylab = "Oil Price",
       type = "n",
       ylim = range(lower, upper))
polygon(c(oil.price %>% time(), oil.price %>% time() %>% rev()),
        c(upper, lower %>% rev()),
        col = rgb(0, 0, 0.6, 0.2),
        border = FALSE)
oil.price %>%
  lines()
points(x = oil.price %>% time() %>% extract(outliers),
       y = oil.price %>% extract(outliers),
       pch = 19)

(II) General Outlier Dectection Algorithm

1. Algorithm Design

# helper 1
h_get_BIC_value = function(input_diff_data,
                           input_max_ar,
                           input_max_ma,
                           input_BIC_values_position) {
  # Get all BIC values
  BIC_values = input_diff_data %>%
    armasubsets(input_max_ar, input_max_ma) %>%
    summary() %>%
    extract2(input_BIC_values_position)
  # Find out the smallest BIC value
  BIC_value = which(BIC_values == min(BIC_values))
  return(BIC_value)
}

# helper 2
h_get_AR_or_MA_shaded_box_logic_values = function(input_diff_data,
                                                  input_max_ar,
                                                  input_max_ma,
                                                  input_AR_MA_position,
                                                  input_BIC_value,
                                                  input_AR_or_MA_start,
                                                  input_AR_or_MA_end) {
  AR_or_MA_shaded_box_logic_values = input_diff_data %>%
    armasubsets(input_max_ar, input_max_ma) %>%
    summary() %>%
    extract2(input_AR_MA_position) %>%
    extract(input_BIC_value, input_AR_or_MA_start : input_AR_or_MA_end) %>%
    unname()
  return(AR_or_MA_shaded_box_logic_values)
}

# helper 3
h_get_model_AR_or_MA_order = function(input_model_AR_or_MA_position) {
  if (input_model_AR_or_MA_position %>% length() %>% equals(0)) {
    model_AR_or_MA_order = 0
  }
  else {
    model_AR_or_MA_order = input_model_AR_or_MA_position %>% max()
  }
  return(model_AR_or_MA_order)
}

# Build ARIMA model by given data
build_arima_model = function(input_data){
  ### Pick 6 as the maximum AR and MA value
  max_ar = 6 # user adjustable
  max_ma = 6 # user adjustable
  
  ### diff_1 is normal difference of input data, diff_2 is seasonal difference of diff_1
  diff_1 = input_data %>% diff()
  diff_2 = diff_1 %>% diff(lag = 12)
  
  ### Let model_1 be the ARIMA model based on diff_1 and model_2 be the ARIMA model based on diff_2.
  ### ARMA subset summary does not give in descending order of BIC, so determination of the row with smallest BIC value is necessary. Fortunately, the 6th section of the summary is BIC value.
  BIC_values_position = 6
  
  ### AR and MA positions 
  ar_start = 2
  ar_end = ar_start + max_ar - 1
  ma_start = ar_end + 1
  ma_end = ma_start + max_ma - 1
  
  ### The 1st section of the summary is AR and MA part. TRUE indicates shaded box from the plot.
  AR_MA_position = 1
  
  ### Model 1
  model_1_BIC_value = h_get_BIC_value(diff_1,
                                      BIC_values_position,
                                      max_ar, max_ma)
  # Get shaded box logic value for AR part of model_1
  model_1_ar = h_get_AR_or_MA_shaded_box_logic_values(diff_1,
                                                      max_ar, max_ma,
                                                      AR_MA_position,
                                                      model_1_BIC_value,
                                                      ar_start, ar_end)
  # Get shaded box logic value for MA part of model_1
  model_1_ma = h_get_AR_or_MA_shaded_box_logic_values(diff_1,
                                                      max_ar, max_ma,
                                                      AR_MA_position,
                                                      model_1_BIC_value,
                                                      ma_start, ma_end)
  # Now, shaded boxes for model_1 is ready. Need to determine order and fixed position of AR and MA.
  model_1_ar_position = model_1_ar %>% equals(TRUE) %>% which()
  model_1_ar_order = h_get_model_AR_or_MA_order(model_1_ar_position)
  model_1_ma_position = model_1_ma %>% equals(TRUE) %>% which()
  model_1_ma_order = h_get_model_AR_or_MA_order(model_1_ma_position)
  model_1_fixed = 0 %>% rep(model_1_ar_order + model_1_ma_order)
  model_1_fixed[c(model_1_ar_position, model_1_ar_order + model_1_ma_position)] = NA
  # Set arima model
  model_1 = arima(input_data,
                  order = c(model_1_ar_order, 1, model_1_ma_order),
                  fixed = model_1_fixed,
                  transform.pars = FALSE)
  
  
  ### Model 2
  model_2_BIC_value = h_get_BIC_value(diff_2,
                                      BIC_values_position,
                                      max_ar, max_ma)
  model_2_ar = h_get_AR_or_MA_shaded_box_logic_values(diff_2,
                                                      max_ar, max_ma,
                                                      AR_MA_position,
                                                      model_2_BIC_value,
                                                      ar_start, ar_end)
  model_2_ma = h_get_AR_or_MA_shaded_box_logic_values(diff_2,
                                                      max_ar, max_ma,
                                                      AR_MA_position,
                                                      model_2_BIC_value,
                                                      ma_start, ma_end)
  model_2_ar_position = model_2_ar %>% equals(TRUE) %>% which()
  model_2_ar_order = h_get_model_AR_or_MA_order(model_2_ar_position)
  model_2_ma_position = model_2_ma %>% equals(TRUE) %>% which()
  model_2_ma_order = h_get_model_AR_or_MA_order(model_2_ma_position)
  model_2_fixed = 0 %>% rep(model_2_ar_order + model_2_ma_order)
  model_2_fixed[c(model_2_ar_position, model_2_ar_order + model_2_ma_position)] = NA
  model_2 = arima(input_data,
                  order = c(model_2_ar_order, 2, model_2_ma_order),
                  fixed = model_2_fixed,
                  transform.pars = FALSE)
  
  ### Compare BIC of both model_1 and model_2, and let the one with smaller BIC value be the optimize simulation model
  if (model_1 %>% BIC() %>% is_weakly_less_than(model_2 %>% BIC())) {
    arima_model = model_1
  }
  else {
    arima_model = model_2
  }
  return(arima_model)
}

# Detect outlier points and plot them out on the graph
detect_outliers = function(input_data){
  arima_model = build_arima_model(input_data)
  fitted_data = fitted(arima_model)
  upper = fitted_data + 2.5 * sqrt(arima_model %>% use_series(sigma2))
  lower = fitted_data - 2.5 * sqrt(arima_model %>% use_series(sigma2))
  outliers = input_data %>% is_less_than(lower) %>% or(input_data %>% is_greater_than(upper))
  
  # Plot graph
  input_data %>%
    plot(main = "Outlier Detection",
         ylab = "Oil Price",
         type = "n",
         ylim = range(lower, upper))
  polygon(c(input_data %>% time(), input_data %>% time() %>% rev()),
          c(upper, lower %>% rev()),
          col = rgb(0, 0, 0.6, 0.2),
          border = FALSE)
  input_data %>%
    lines()
  points(x = input_data %>% time() %>% extract(outliers),
         y = input_data %>% extract(outliers),
         pch = 19)
  
  # Print position of outliers
  paste0("Outliers at: ",
         outliers %>%
           equals(TRUE) %>%
           which() %>%
           paste(collapse = ", ")) %>%
    cat()
}

2. Demo

library(TSA)
data("oil.price")
input_data = oil.price

detect_outliers(input_data)

Outliers at: 2, 56, 180, 226, 227, 234