# for data wrangling (coming up with new columns, filtering rows...)
library(dplyr)
# for plotting graphics
library(ggplot2)
# for reading the shapefile
library(rgdal)
# join function for merging datasets
library(plyr)
# interactive map visualization
library(leaflet)
# color palette to show distribution of data
library(RColorBrewer)
We need two datasets to complete this tutorial: Kenya Country shapefile with administrative boundaries KNBS Dataset that has the GCP data that we need to visualize on the shapefile.
# load in the gcp dataset
gcp_df <- read.csv('datasets/gcp-current.csv', header=TRUE)
gcp_df <- gcp_df[c(1:47),]
names(gcp_df) <- c("COUNTY","YEAR_2013","YEAR_2014","YEAR_2015","YEAR_2016","YEAR_2017","YEAR_2018","YEAR_2019","YEAR_2020")
head(gcp_df)
## COUNTY YEAR_2013 YEAR_2014 YEAR_2015 YEAR_2016 YEAR_2017 YEAR_2018
## 1 BARINGO 35852 42040 47684 51259 59403 61894
## 2 BOMET 64268 67305 85548 100542 120512 133969
## 3 BUNGOMA 99597 118448 126232 137028 163810 177336
## 4 BUSIA 40108 46062 51610 57251 64007 71210
## 5 ELGEYO MARAKWET 36579 45062 46129 57669 70844 88863
## 6 EMBU 81946 87820 94830 110119 119393 134488
## YEAR_2019 YEAR_2020
## 1 71544 76636
## 2 137691 152744
## 3 193711 206705
## 4 83280 90817
## 5 104844 117047
## 6 138867 153927
Add a new column for the average GCP for years 2013-2020
# calculate average gcp per county for years from 2013 to 2020
gcp_df <- gcp_df %>%
mutate(gcp_mean=rowMeans(select(gcp_df, starts_with("YEAR_"))))
head(gcp_df)
## COUNTY YEAR_2013 YEAR_2014 YEAR_2015 YEAR_2016 YEAR_2017 YEAR_2018
## 1 BARINGO 35852 42040 47684 51259 59403 61894
## 2 BOMET 64268 67305 85548 100542 120512 133969
## 3 BUNGOMA 99597 118448 126232 137028 163810 177336
## 4 BUSIA 40108 46062 51610 57251 64007 71210
## 5 ELGEYO MARAKWET 36579 45062 46129 57669 70844 88863
## 6 EMBU 81946 87820 94830 110119 119393 134488
## YEAR_2019 YEAR_2020 gcp_mean
## 1 71544 76636 55789.00
## 2 137691 152744 107822.38
## 3 193711 206705 152858.38
## 4 83280 90817 63043.12
## 5 104844 117047 70879.62
## 6 138867 153927 115173.75
The shapefile data is available in shp format from various sources. In this case shapefile was obtained from openAFRICA
Use rgdal package to load in the shapefile and plot it to visualize county level administrative boundaries.
# load in the shapefile
shp <- readOGR(dsn="kenyan-counties", layer="County")
plot(shp)
Let’s peek on the shp data for any insights.
# have a look at the format of the shapefile data
head(shp@data)
## OBJECTID AREA PERIMETER COUNTY3_ COUNTY3_ID COUNTY Shape_Leng Shape_Area
## 0 1 5.677 15.047 2 1 Turkana 15.046838 5.6769851
## 1 2 6.177 11.974 3 2 Marsabit 11.974165 6.1768307
## 2 3 2.117 7.355 4 3 Mandera 7.355154 2.1171961
## 3 4 4.610 9.838 5 4 Wajir 9.838408 4.6095892
## 4 5 0.740 5.030 6 5 West Pokot 5.030271 0.7404806
## 5 6 1.713 8.311 7 6 Samburu 8.311013 1.7130145
From the above output, we can see that the COUNTY column of the shp data is in title case. Note that in the KNBS GCP dataset the COUNTY column is in upper case thus causing a mismatch.
We start by ensuring that the COUNTY column for both
datasets match. We achieve this by converting the shapefile COUNTY
column into uppercase. Then we apply the setdiff()
function
to compare the columns from the two datasets to allow us to see which
rows (counties) have mismatching names.
# convert shapefile county names to uppercase to match KNBS dataset
shp$COUNTY <- stringr::str_to_upper(shp$COUNTY)
# inspect to ensure shapefile county names match gcp dataset county names
setdiff(shp$COUNTY, gcp_df$COUNTY)
## [1] "KEIYO-MARAKWET" "THARAKA" "MURANG'A"
The output suggests that three counties have mismatching names and thus need to be fixed. Update the shapefile county names to ensure that they match names as used in the KNBS GCP dataset.
# inconsistencies: KEIYO-MARAKWET, UASIN GISHU, THARAKA, MURANG'A
marakwet_id <- which(shp$COUNTY=="KEIYO-MARAKWET")
shp$COUNTY[marakwet_id] <- "ELGEYO MARAKWET"
tharaka_id <- which(shp$COUNTY=="THARAKA")
shp$COUNTY[tharaka_id] <- "THARAKA NITHI"
muranga_id <- which(shp$COUNTY=="MURANG'A")
shp$COUNTY[muranga_id] <- "MURANGA"
# inspect to ensure that all county names match in both shapefile and the knbs dataset
setdiff(shp$COUNTY, gcp_df$COUNTY)
## character(0)
From the above output we see that all counties have matching names and this will ensure that we can join both datasets on the county names.
The code below is used to join the shapefile dataset with the GCP
dataset on the COUNTY column. We also convert the
shapefile into a dataframe using the fortify()
function to
make it compatible with ggplot format.
shp@data$id <- rownames(shp@data)
shp@data <- join(shp@data, gcp_df, by="COUNTY")
shp_df <- fortify(shp)
## Regions defined for each Polygons
kenya_df <- join(shp_df, shp@data, by="id")
Now that our datasets are clean we can readily visualize the county data on a map.
We first come up with a basic ggplot map to visualize county borders before we move to customizing the graphics.
# use ggplot to visualize contribution of gcp for various counties
ggplot() +
geom_polygon(data=kenya_df, aes(long, lat, group=group, fill=gcp_mean), colour="grey", linewidth=0.5) +
geom_path(data=kenya_df, aes(long, lat, group=group), color="black", linewidth=0.5)
From the above map, it is easy to see that Nairobi County GCP is different from the GCP of other counties with a big range. It is hard to contrast the difference between GCP of different counties with such a disparity in values.
We fix this range problem by converting the GCP values to their
natural log using the log()
function.
Come up with a better ggplot using the log of the average of the gcp
# use ggplot to visualize contribution of gcp for various counties
ggplot() +
geom_polygon(data=kenya_df, aes(long, lat, group=group, fill=log(gcp_mean)), colour="grey", linewidth=0.5) +
geom_path(data=kenya_df, aes(long, lat, group=group), color="black", linewidth=0.5) +
labs(title="Map of County GCP (Using log of GCP)")
From the above map we can easily contrast the GCP of different counties. Higher GCP is represented by lighter blue color while lower GCP is represented by dark blue color.
Contrast the color for Nairobi County with that of other counties in the northern region. We can conclude that using the Natural Log of the GCP will enhance the quality of our visualization.
Enhance your ggplot to come up with final ggplot map. This includes using a sequential color palette to allow easy contrast for different values of the GCP. We also add titles and subtitle that match the KNBS color themes.
Here is the code for the final ggplot map.
# final ggplot2 map
ggplot() +
geom_polygon(data=kenya_df, aes(long, lat, group=group, fill=log(gcp_mean)), colour="grey", linewidth=0.5) +
geom_path(data=kenya_df, aes(long, lat, group=group), color="black", linewidth=0.5) +
scale_fill_distiller(name="Log GCP", palette="YlOrBr")+
labs(title="Gross County Product Map", subtitle="Average GCP for years 2013 - 2020", caption="Use Natural Log of the GCP to minimize effect of disparity") +
theme_void() +
theme(
plot.title = element_text(hjust=0.5, size=17, face="bold"),
plot.caption = element_text(color="#edae49", size=13, hjust=0.5, face="italic"),
plot.subtitle = element_text(vjust=0.5, hjust=0.5, size=15, face="italic", family="Helvetica", color="brown"))
The advantage of using leaflet is that it allows the user to move the cursor on the map and the map provides values for each county where the user’s mouse is hovering.
Start by plotting a basic leaflet chart. To produce a good leaflet chart for Kenya, we need to set constraints for the latitude and longitude that bounds the country.
Since the country is within the Equator region the latitude ranges from +0 or -0. For the longitudes we bound it from 33 degrees to 41 degrees. Given these bounds the output map will focus on the Kenya region.
Try hovering over the country and see the change in county names as you move from one administrative boundary to the next.
# create a basic leaflet object to visualize county borders
leaflet() %>%
addProviderTiles(providers$Esri.WorldGrayCanvas, group="Default Maptile", options=providerTileOptions(noWrap=TRUE)) %>%
fitBounds(33.97, -4.471, 41.85688, 3.93726) %>%
setMaxBounds(32, -3.9, 43, 4.5) %>%
setView(lng=37.9062, lat=1.00, zoom=6) %>%
addPolygons(data=shp, col="blue", weight=1, layerId = ~id, label=~COUNTY)
Now that we have successfully produced a leaflet visualization showing the Country we can move ahead to enhance the visualization and also display the different GCP values as recorded on the GCP dataset.
Come up with color palette for enhancing contrast We will use different colors to show the distribution of GCP across different counties. We use a continuous color palette for this task. In this case the number 25 is chosen randomly and you should try using different values.
# use a continuous color palette
qpal <- colorQuantile(rev(viridis::viridis(25,option="G")),
shp$YEAR_2013, n=25)
qpal(shp$YEAR_2013)
## [1] "#359EAA" "#B1E4C2" "#96DDB5" "#B1E4C2" "#5BCDAD" "#DEF5E5" "#DEF5E5"
## [8] "#5BCDAD" "#49C1AD" "#3B5698" "#3A2C58" "#96DDB5" "#2B1C35" "#342346"
## [15] "#38AAAC" "#3FB6AD" "#211423" "#40498E" "#38AAAC" "#170C14" "#3FB6AD"
## [22] "#359EAA" "#78D6AE" "#413E7E" "#2B1C35" "#3E356B" "#C9ECD3" "#3486A5"
## [29] "#38629D" "#40498E" "#366FA0" "#366FA0" "#3492A8" "#3B5698" "#413E7E"
## [36] "#3A2C58" "#357BA2" "#0B0405" "#211423" "#38629D" "#0B0405" "#3492A8"
## [43] "#C9ECD3" "#342346" "#78D6AE" "#3486A5" "#170C14"
When displaying the leaflet we want the user to easily see the GCP values for different counties as they hover over the map. Create labels to be used when interacting with map to show county values.
# create text for county labels
mylabels <- paste(
"County: ", shp$COUNTY,"<br/>",
"GCP: ", prettyNum(shp$YEAR_2013, big.mark=",")
) %>% lapply(htmltools::HTML)
# plot title
plot_title <- "<h4>Gross County Product (GCP) For Year 2013, in Kshs Million</h4>"
Now that we have the color palette and labels, we move on to create the final leaflet visualization. Also give the plot a title to allow for intuitive interpretation of the visualized map.
Final Enhanced map
# FINAL MAP
leaflet() %>%
addProviderTiles(providers$Esri.WorldGrayCanvas, group="Default Maptile", options=providerTileOptions(noWrap=TRUE)) %>%
fitBounds(33.97, -4.471, 41.85688, 3.93726) %>%
setMaxBounds(32, -3.9, 43, 4.5) %>%
setView(lng=37.9062, lat=1.00, zoom=6) %>%
addPolygons(
data=shp,
col="whitesmoke",
weight=1,
label=mylabels,
labelOptions = labelOptions(
style=list("font-weight"="bold", padding="3px 8px"),
direction="auto"
),
layerId = ~id,
fillOpacity=1,
fillColor=~qpal(YEAR_2013),
highlightOptions = highlightOptions(
color="#000000", weight=2,
bringToFront=TRUE, sendToBack = TRUE
)
) %>%
addControl(html=plot_title, position="topright")
Now that we have produced the GCP visualizations we should let the user choose between different years since data provided includes years 2013-2020.
This will be accomplished on a Shiny App. Below is a preview of the Interactive Shiny app that allows a user to choose between different years and also visualize GCP contribution by different sectors of the economy.
Visit the Shiny app to explore how different sectors of the economy contributed to the GCP of various counties. You can also compare top counties per sector.