1 Background

  • I joined Prof. Hong-Kun Zhang’s research group in the field of financial mathematics when I was at UMass Amherst. At that time, the data were extracted from Yahoo Finance.
  • After completing my Master’s program at Cornell, I decided to improve my skills in R, including:
    • Web wrangling
    • Regex and string manipulation
    • Function
    • Tidyverse ecosystem syntax
    • Shiny (dashboard)
  • So, in late 2019, I built a Shiny application (version 1) using yahoo finance data and deployed it on RStudio’s shinyapps.io.
  • In 2022, I found the app can barely run. There are warnings when opening the app, and both chart and table could not display properly (may cause by a bug from quantmod::getSymbols). Also, the data behind Yahoo Finance’s source code also had a big change, and many of the parsing methods I designed may not compatible either. I decided to build a new version to fix all the problems and added some new features to the dashboard.

2 Design

2.1 User Inputs

user_inputs
 |-- symbol
 |    |-- default_symbol
 |    |-- customized_symbol
 |-- name
 |-- market_price
 |-- market_change
 |-- market_change_percent
 |-- currency
 |-- time_period
 |    |-- default_time_period
 |    |-- customized_time_period
 |-- frequency
 |-- dataframe_columns
 |-- visualization_type

2.1.1 Exchange Index

exchange_index
 |-- default_index
 |-- customized_index
default_index:
- ^DJI
- ^GSPC
- ^IXIC
- ^RUT
- ^FTSE
- ^N225
- 000001.SS
- 399106.SZ
- ^TNX
- BTC-USD
customized_index:
- C (default)

3 Implementation

3.1 Helper Functions

3.1.1 Read Source Code

read_source_code = function(symbol) {
  web_link = paste0("https://finance.yahoo.com/quote/", symbol)
  source_code = read_lines(web_link)
  return(source_code)
}

3.1.2 Extract Info

extract_main_info_line = function(source_code) {
  h1_class_locations = which(source_code %>%
                               str_detect(pattern = "h1 class") %>%
                               equals(TRUE))
  
  ### "h1 class" not in source code caused by invalid input exchange code
  if (h1_class_locations %>% length() %>% equals(0)) {
    # cat("h1 class has no location, the exchange code might be invalid")
    return(NA)
  }
  
  ### pick the first "h1 class" line
  if (h1_class_locations %>% length() %>% equals(1)) {
    main_info_line_index = h1_class_locations
  } else {
    main_info_line_index = h1_class_locations %>% extract(1)
    # cat("h1 class has multiple locations")
  }
  
  main_info_line = source_code %>% extract(main_info_line_index)
  return(main_info_line)
}

3.1.3 Extract Name

h_clean_name = function(raw_name) {
  if (raw_name %>% str_detect(pattern = "&")) {
    cleaned_name = raw_name %>%
      str_replace(pattern = "&", replacement = "&")
  } else {
    cleaned_name = raw_name
  }
  return(cleaned_name)
}

extract_name = function(main_info_line) {
  ### main_info_line is NA caused by invalid input exchange code
  if (main_info_line %>% is.na()) {
    return(NA)
  }
  
  name = main_info_line %>%
    str_split(pattern = "<h1 class") %>% extract2(1) %>%
    extract(2) %>%
    str_split(pattern = "</h1>") %>% extract2(1) %>%
    extract(1) %>%
    str_split(pattern = ">") %>% extract2(1) %>%
    extract(2) %>%
    h_clean_name()
  return(name)
}

3.1.4 Extract Market Price

extract_market_price = function(main_info_line) {
  ### main_info_line is NA caused by invalid input exchange code
  if (main_info_line %>% is.na()) {
    return(NA)
  }
  
  market_price = main_info_line %>%
    str_split(pattern = "regularMarketPrice") %>% extract2(1) %>%
    extract(2) %>%
    str_split(pattern = "</fin-streamer") %>% extract2(1) %>%
    extract(1) %>%
    str_split(pattern = ">") %>% extract2(1) %>%
    extract(2)
  return(market_price)
}

3.1.5 Extract Market Change

extract_market_change = function(main_info_line) {
  ### main_info_line is NA caused by invalid input exchange code
  if (main_info_line %>% is.na()) {
    return(NA)
  }
  
  market_change = main_info_line %>%
    str_split(pattern = "regularMarketChange") %>% extract2(1) %>%
    extract(2) %>%
    str_split(pattern = "</fin-streamer") %>% extract2(1) %>%
    extract(1) %>%
    str_split(pattern = ">") %>% extract2(1) %>%
    extract(3) %>%
    str_split(pattern = "<") %>% extract2(1) %>%
    extract(1)
  return(market_change)
}

3.1.6 Extract Market Change Percent

extract_market_change_percent = function(main_info_line) {
  ### main_info_line is NA caused by invalid input exchange code
  if (main_info_line %>% is.na()) {
    return(NA)
  }
  
  market_change_percent = main_info_line %>%
    str_split(pattern = "regularMarketChangePercent") %>% extract2(1) %>%
    extract(2) %>%
    str_split(pattern = "</fin-streamer") %>% extract2(1) %>%
    extract(1) %>%
    str_split(pattern = ">") %>% extract2(1) %>%
    extract(3) %>%
    str_split(pattern = "<") %>% extract2(1) %>%
    extract(1)
  # %>%
  #   str_split(pattern = "\\(") %>% extract2(1) %>%
  #   extract(2) %>%
  #   str_split(pattern = "\\)") %>% extract2(1) %>%
  #   extract(1)
  return(market_change_percent)
}

3.1.7 Extract Currency

extract_currency = function(main_info_line) {
  ### main_info_line is NA caused by invalid input exchange code
  if (main_info_line %>% is.na()) {
    return(NA)
  }
  
  currency = main_info_line %>%
    str_split(pattern = "Currency in ") %>% extract2(1) %>%
    extract(2) %>%
    str_split(pattern = "<") %>% extract2(1) %>%
    extract(1)
  return(currency)
}

3.1.8 Extract Data

h_clean_df_colname = function(col_name) {
  cleaned_col_name = col_name %>%
    str_split(pattern = "\\.") %>% extract2(1)
  cleaned_col_name = cleaned_col_name %>%
    extract(cleaned_col_name %>% length())
  return(cleaned_col_name)
}

extract_data = function(symbol, from_date = today() - years(1), to_date = today(), frequency = "daily") {
  df = getSymbols(Symbols = symbol,
                  from = from_date,
                  to = to_date,
                  periodicity = frequency,
                  return.class = "xts",
                  auto.assign = FALSE) %>%
    as.data.frame()
  colnames(df) = colnames(df) %>% sapply(h_clean_df_colname)
  df = df %>%
    mutate(Date = df %>% rownames()) %>%
    mutate(Date = Date %>% as.Date(),
           Open = Open %>% round(digits = 2),
           High = High %>% round(digits = 2),
           Low = Low %>% round(digits = 2),
           Close = Close %>% round(digits = 2),
           Adjusted = Adjusted %>% round(digits = 2)) %>%
    arrange(Date %>% desc())
  rownames(df) = NULL
  df = df %>%
    select(Date, Open, High, Low, Close, Adjusted, Volume) %>%
    rename(`Adjusted Close` = Adjusted)
  return(df)
}

3.2 test

3.2.1 test_symbols

test_symbols = c("^DJI",
                 "^GSPC",
                 "^IXIC",
                 "^RUT",
                 "^FTSE",
                 "^N225",
                 "000001.SS",
                 "399001.SZ",
                 "^TNX",
                 "IVV")
# test_symbols = test_symbols[1:3]

3.2.2 df1: string extraction

test_source_codes = test_symbols %>% lapply(read_source_code)
test_main_info_lines = test_source_codes %>% lapply(extract_main_info_line)
test_names = test_main_info_lines %>% sapply(extract_name)
test_market_prices = test_main_info_lines %>% sapply(extract_market_price)
test_market_changes = test_main_info_lines %>% sapply(extract_market_change)
test_market_change_percents = test_main_info_lines %>% sapply(extract_market_change_percent)
test_currencies = test_main_info_lines %>% sapply(extract_currency)
df = tibble(`Name` = test_names,
            `Market Price` = test_market_prices,
            `Market Change` = test_market_changes,
            `Market Change Percent` = test_market_change_percents,
            `Currency` = test_currencies)
df

3.2.3 df2: data extraction

# test_symbols[1]
df2 = test_symbols[8] %>% extract_data()
# df2 = getSymbols(Symbols = test_symbols[7],
#                   from = "2022-01-01",
#                   to = "2022-08-01",
#                   periodicity = "daily",
#                   return.class = "xts",
#                   auto.assign = FALSE)
df2

3.2.4 p1: static plot

p1 = ggplot(data = df2,
            mapping = aes(x = Date, y = `Adjusted Close`)) +
      labs(y = "Adjusted Close") +
      geom_line(color = "DodgerBlue", size = 1.2)

p1

3.2.5 p2: interactive plot

p2 = plot_ly(data = df2,
             x = ~Date,
             width = 800,
             height = 480) %>%
  add_trace(y = ~`Adjusted Close`,
            type = "scatter",
            mode = "lines",
            name = "Adjusted Close",
            line = list(color = "Orange",
                        width = 2)
            ) %>%
  add_trace(y = ~Open,
            type = "scatter",
            mode = "lines",
            name = "Open",
            visible = "legendonly",
            line = list(color = "LimeGreen",
                        width = 2,
                        dash = "dash")
            ) %>%
  add_trace(y = ~High,
            type = "scatter",
            mode = "lines",
            name = "High",
            line = list(color = "DodgerBlue",
                        width = 1,
                        dash = "dot")
            ) %>%
  add_trace(y = ~Low,
            type = "scatter",
            mode = "lines",
            name = "Low",
            line = list(color = "Crimson",
                        width = 1,
                        dash = "dot")
            ) %>%
  layout(font = list(family = "Helvetica",
                     size = 12,
                     color = "Black"),
         title = paste("From",
                       min(df2$Date),
                       "to",
                       max(df2$Date)),
         xaxis = list(title = "Date"),
         yaxis = list(title = "Value"),
         legend = list(x = 1,
                       y = 0.9,
                       title = list(
                         text = "Value Type",
                         font = list(family = "Helvetica",
                                     size = 12,
                                     color = "Blue")),
                       bgcolor = "rgba(0,0,0,0)"),
         plot_bgcolor = "WhiteSmoke")

p2

3.3 Main Functions

3.4 Shiny


Copyright © 2022 Harry Lu. All rights reserved.