(I) Background

  • Instructor: Peng Wang, AVP, Head of Data Science - Operation & Fraud Detection at MassMutual Financial Group (Fall 2016)
  • The dataset used in this project is gapminder from Bioconnector. It records the Life Expectancy, Population, and GDP per Capita of 142 countries across five continents, from 1952 to 2007 at 5-year intervals.
  • Download Data
  • This project includes multiple visualizations of the dataset (Section II) and an interactive web application (Section III).

(II) Data Visualizations

# Location of the gapminder CSV published on the Bioconnector site.
gapminder_url = "https://bioconnector.github.io/workshops/data/gapminder.csv"

# Read the CSV straight from the URL.
gapminder = gapminder_url %>% read_csv()
# Distinct values of the year column; reused below as x-axis breaks and labels.
Years = unique(gapminder$year)
# Render a data frame as a centre-aligned kable table with Bootstrap styling.
#
# df: the data frame to display.
# Returns the styled table object, with the "vertical-align: middle;" CSS
# applied to rows 0 through nrow(df).
render_df = function(df) {
  table_styles = c("striped", "hover", "responsive", "condensed")
  # NOTE(review): row 0 is understood to address the header row in
  # kableExtra::row_spec(), so 0:n covers the header plus every body row.
  all_rows = 0:nrow(df)
  styled = kable_styling(kable(df, align = "c"),
                         bootstrap_options = table_styles,
                         fixed_thead = TRUE,
                         full_width = FALSE)
  row_spec(styled, all_rows, extra_css = "vertical-align: middle;")
}

1. Number of countries per continent

  • How many unique countries are represented per continent?
# Count the distinct countries recorded for each continent.
df_1 = gapminder %>%
  group_by(continent) %>%
  summarise(Country = n_distinct(country)) %>%
  rename(Continent = continent)

# Show the counts as a styled table.
render_df(df_1)
Continent Country
Africa 52
Americas 25
Asia 33
Europe 30
Oceania 2
# Bar chart of the per-continent country counts, with a centred title.
p_1 = ggplot(df_1, aes(x = Continent, y = Country)) +
  geom_bar(stat = "identity", width = 0.5, fill = "cornflowerblue") +
  labs(title = "Country Number of Each Continent") +
  theme(plot.title = element_text(hjust = 0.5, size = 20))
p_1

2. Average life expectancy

  • According to the data available, what was the average Life Expectancy across each continent from 1952 to 2007?
# Mean life expectancy for every continent in every recorded year.
df_2 = gapminder %>%
  group_by(continent, year) %>%
  summarise(`Average Life Expectancy` = mean(lifeExp)) %>%
  # summarise() only peels off the last grouping level (year); drop the
  # remaining continent grouping so later verbs on df_2 do not silently
  # operate per continent.
  ungroup() %>%
  rename(Continent = continent, Year = year)

# One line per continent, with a tick at every recorded year.
p_2 = ggplot(data = df_2,
             mapping = aes(x = Year,
                           y = `Average Life Expectancy`,
                           color = Continent)) +
  geom_point() +
  geom_line() +
  ggtitle("Average Life Expectancy per Continent") +
  ylab("Life Expectancy (Years)") +
  theme(plot.title = element_text(size = 20, hjust = 0.5)) +
  scale_x_continuous(labels = Years, breaks = Years)
# Convert the static plot to an interactive plotly widget.
p_2 %>% ggplotly()
  • What was the Life Expectancy of every country in the Americas?
# Life expectancy per country and year, restricted to the Americas.
df_3 = gapminder %>%
  # Plain `==` avoids relying on magrittr's equals() alias being attached.
  filter(continent == "Americas") %>%
  group_by(country, year) %>%
  summarise(`Average Life Expectancy` = mean(lifeExp)) %>%
  # summarise() only drops the year grouping; remove the leftover country
  # grouping so df_3 is a plain tibble.
  ungroup() %>%
  rename(Country = country, Year = year)

# One line per country, with a tick at every recorded year.
p_3 = ggplot(data = df_3,
             mapping = aes(x = Year,
                           y = `Average Life Expectancy`,
                           color = Country)) +
  geom_point() +
  geom_line() +
  ggtitle("Average Life Expectancy in Americas") +
  ylab("Life Expectancy (Years)") +
  theme(plot.title = element_text(size = 20, hjust = 0.5)) +
  scale_x_continuous(labels = Years, breaks = Years)
# Convert the static plot to an interactive plotly widget.
p_3 %>% ggplotly()
  • Which countries had the longest average Life Expectancy in the world?
# Rank countries by their mean life expectancy over all recorded years and
# keep the five highest.
df_4 = gapminder %>%
  group_by(country) %>%
  summarise(`Average Life Expectancy` = mean(lifeExp)) %>%
  rename(Country = country) %>%
  arrange(desc(`Average Life Expectancy`)) %>%
  head(5)
render_df(df_4)
Country Average Life Expectancy
Iceland 76.51142
Sweden 76.17700
Norway 75.84300
Netherlands 75.64850
Switzerland 75.56508
# Bar chart of the five countries with the longest average life expectancy.
# Mapping the raw Country text to x makes ggplot lay the bars out
# alphabetically, which discards the ranking computed in df_4; reorder()
# sorts the bars from longest to shortest instead.
p_4 = ggplot(data = df_4,
             mapping = aes(x = reorder(Country, -`Average Life Expectancy`),
                           y = `Average Life Expectancy`)) +
  geom_bar(stat = "identity", fill = "cornflowerblue", width = 0.5) +
  ggtitle("Countries with Longest Average Life Expectancy") +
  # Without this the reorder() expression would become the axis title.
  xlab("Country") +
  theme(plot.title = element_text(size = 20, hjust = 0.5))
p_4

  • Which countries had the shortest average Life Expectancy in the world?
# Rank countries by their mean life expectancy over all recorded years and
# keep the five lowest.
df_5 = gapminder %>%
  group_by(country) %>%
  summarise(`Average Life Expectancy` = mean(lifeExp)) %>%
  rename(Country = country) %>%
  arrange(`Average Life Expectancy`) %>%
  head(5)
render_df(df_5)
Country Average Life Expectancy
Sierra Leone 36.76917
Afghanistan 37.47883
Angola 37.88350
Guinea-Bissau 39.21025
Mozambique 40.37950
# Bar chart of the five countries with the shortest average life expectancy.
# Mapping the raw Country text to x makes ggplot lay the bars out
# alphabetically, which discards the ranking computed in df_5; reorder()
# sorts the bars from shortest to longest instead.
p_5 = ggplot(data = df_5,
             mapping = aes(x = reorder(Country, `Average Life Expectancy`),
                           y = `Average Life Expectancy`)) +
  geom_bar(stat = "identity", fill = "cornflowerblue", width = 0.5) +
  ggtitle("Countries with Shortest Average Life Expectancy") +
  # Without this the reorder() expression would become the axis title.
  xlab("Country") +
  theme(plot.title = element_text(size = 20, hjust = 0.5))
p_5

3. Average population

  • According to the data available, what was the average Population across each continent from 1952 to 2007?
# Mean population for every continent in every recorded year.
df_6 = gapminder %>%
  group_by(continent, year) %>%
  summarise(`Average Population` = mean(pop)) %>%
  # summarise() only peels off the last grouping level (year); drop the
  # remaining continent grouping so df_6 is a plain tibble.
  ungroup() %>%
  rename(Continent = continent, Year = year)

# Y-axis ticks every 10 million from 0 to 120 million, labelled "0", "10M", ...
# NOTE(review): `popupation_breaks` is a misspelling of "population"; the name
# is kept because it is reassigned and read again further down the file.
population_labels = c("0", paste0(seq(from = 10, to = 120, by = 10), "M"))
popupation_breaks = seq(from = 0, to = 120, by = 10) * 10^6

# One line per continent, with a tick at every recorded year.
p_6 = ggplot(data = df_6,
             mapping = aes(x = Year,
                           y = `Average Population`,
                           color = Continent)) +
  geom_point() +
  geom_line() +
  ggtitle("Average Population per Continent") +
  theme(plot.title = element_text(size = 20, hjust = 0.5)) +
  scale_y_continuous(labels = population_labels, breaks = popupation_breaks) +
  scale_x_continuous(labels = Years, breaks = Years)
# Convert the static plot to an interactive plotly widget.
p_6 %>% ggplotly()
  • What was the Population of every country in the Americas?
# Population per country and year, restricted to the Americas.
df_7 = gapminder %>%
  # Plain `==` avoids relying on magrittr's equals() alias being attached.
  filter(continent == "Americas") %>%
  group_by(country, year) %>%
  summarise(`Average Population` = mean(pop)) %>%
  # summarise() only drops the year grouping; remove the leftover country
  # grouping so df_7 is a plain tibble.
  ungroup() %>%
  rename(Country = country, Year = year)

# Y-axis ticks every 30 million from 0 to 300 million, labelled "0", "30M", ...
# NOTE(review): `popupation_breaks` is a misspelling of "population"; the name
# is kept because the same variable is also assigned earlier in the file.
population_labels = c("0", paste0(seq(from = 30, to = 300, by = 30), "M"))
popupation_breaks = seq(from = 0, to = 300, by = 30) * 10^6

# One line per country, with a tick at every recorded year.
p_7 = ggplot(data = df_7,
             mapping = aes(x = Year,
                           y = `Average Population`,
                           color = Country)) +
  geom_point() +
  geom_line() +
  ggtitle("Average Population in Americas") +
  theme(plot.title = element_text(size = 20, hjust = 0.5)) +
  scale_y_continuous(labels = population_labels, breaks = popupation_breaks) +
  scale_x_continuous(labels = Years, breaks = Years)
# Convert the static plot to an interactive plotly widget.
p_7 %>% ggplotly()

4. Average GDP per Capita

  • According to the data available, what was the average GDP per Capita across each continent from 1952 to 2007?
# Mean GDP per capita for every continent in every recorded year.
df_8 = gapminder %>%
  group_by(continent, year) %>%
  summarise(`Average GDP per Capita` = mean(gdpPercap)) %>%
  # summarise() only peels off the last grouping level (year); drop the
  # remaining continent grouping so df_8 is a plain tibble.
  ungroup() %>%
  rename(Continent = continent, Year = year)

# One line per continent, with a tick at every recorded year.
p_8 = ggplot(data = df_8,
             mapping = aes(x = Year,
                           y = `Average GDP per Capita`,
                           color = Continent)) +
  geom_point() +
  geom_line() +
  ggtitle("Average GDP per Capita per Continent") +
  theme(plot.title = element_text(size = 20, hjust = 0.5)) +
  scale_x_continuous(labels = Years, breaks = Years)
# Convert the static plot to an interactive plotly widget.
p_8 %>% ggplotly()
  • What was the GDP per Capita of every country in the Americas?
# GDP per capita per country and year, restricted to the Americas.
df_9 = gapminder %>%
  # Plain `==` avoids relying on magrittr's equals() alias being attached.
  filter(continent == "Americas") %>%
  group_by(country, year) %>%
  summarise(`Average GDP per Capita` = mean(gdpPercap)) %>%
  # summarise() only drops the year grouping; remove the leftover country
  # grouping so df_9 is a plain tibble.
  ungroup() %>%
  rename(Country = country, Year = year)

# Y-axis ticks every 5,000 from 0 to 45,000, labelled "0", "5K", "10K", ...
gdpPercap_labels = c("0", paste0(seq(from = 5, to = 45, by = 5), "K"))
gdpPercap_breaks = seq(from = 0, to = 45, by = 5) * 10^3

# One line per country, with a tick at every recorded year.
p_9 = ggplot(data = df_9,
             mapping = aes(x = Year,
                           y = `Average GDP per Capita`,
                           color = Country)) +
  geom_point() +
  geom_line() +
  ggtitle("Average GDP per Capita in Americas") +
  theme(plot.title = element_text(size = 20, hjust = 0.5)) +
  scale_y_continuous(labels = gdpPercap_labels, breaks = gdpPercap_breaks) +
  scale_x_continuous(labels = Years, breaks = Years)
# Convert the static plot to an interactive plotly widget.
p_9 %>% ggplotly()

(III) Shiny Application