Please report any issues here.
Messages from clusso
have been suppressed for this tutorial, but there are many messages generated that we hope the user will find helpful. Feedback is welcome here.
This example uses simulated data which are meant for illustration of how to use clusso
with maps in R
.
Load clusso
, as well as the tidyverse
, tigris
, sf
, tmap
, and sp
packages:
library("clusso")
library(tidyverse)
library(tigris)
library(sf)
library(tmap)
library(sp)
If you haven't already, you can install clusso
directly from GitHub:
library("devtools")
devtools::install_github("mkamenet3/clusso")
Using the counties()
function from the tigris
package, we load a county-level map of Wisconsin. We fortify()
the map and create wi_map
in order to use it with ggplot2
:
wi <- tigris::counties("Wisconsin", cb=TRUE)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========= | 14%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|============== | 21%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================ | 24%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 39%
|
|============================ | 40%
|
|============================ | 41%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 49%
|
|=================================== | 50%
|
|=================================== | 51%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|===================================== | 54%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 59%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 69%
|
|================================================= | 70%
|
|================================================= | 71%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 86%
|
|============================================================= | 87%
|
|============================================================= | 88%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 89%
|
|=============================================================== | 90%
|
|=============================================================== | 91%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 92%
|
|================================================================= | 93%
|
|================================================================= | 94%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 99%
|
|======================================================================| 100%
wi_map <- fortify(wi)
To plot the counties of Wisconsin, we use geom_map()
from the ggplot2
package:
p1 <- ggplot() + geom_map(data=wi_map, map=wi_map,
aes(x=long, y=lat, map_id=id),
color="black", fill="white", size=0.25) +
coord_map() +
ylab("Latitude") +
xlab("Longitude")
p1
Next, we simulate some data. For this example, we imagine that in each county there were a total number of people sampled (ntrials), some of which have outcome X (ncases). We assume the total number of people sampled come from a negative binomial distribution and use rnbinom()
to simulate ntrials. We assume cases follow a binomial distribution and use rbinom()
to draw cases from the population with probability \(\pi\). For illustration of clusso
, we put a cluster with elevated rates in Wood, Portage, Waushera, Adams, Juneau, and Marquette counties. We will only be looking at space for this example so there is only a single time period.
#create ncases and ntrials for 72 counties in Wisconsin
set.seed(2)
ncounties <- length(unique(wi@data$NAME))
ntrials <- rnbinom(n=ncounties, mu=1000, size =10)
dat <- cbind.data.frame(id=unique(wi@data$NAME),ntrials)
dat <- dat %>%
mutate(ncases = if_else(id %in% c("Wood", "Portage", "Waushara", "Adams", "Juneau","Marquette"),
rbinom(ncounties,ntrials, 0.9),
rbinom(ncounties,ntrials, 0.1)),
time = "Period1")
str(dat)
## 'data.frame': 72 obs. of 4 variables:
## $ id : chr "Ashland" "Grant" "Sawyer" "Crawford" ...
## $ ntrials: num 668 636 737 743 1262 ...
## $ ncases : int 71 66 60 67 106 67 131 89 54 680 ...
## $ time : chr "Period1" "Period1" "Period1" "Period1" ...
The dataframe dat
, that contains the county name (id), number of people sampled (ntrials), number of people with outcome X (ncases), and number of time periods (time).
Next, we need to merge dat
with the SpatialPolygonsDataFrame
, wi
. We use merge()
from the sp
package to combine the two by county name (county name is the variable NAME in wi
and is id in dat
).
#merge to map
widat <- sp::merge(wi,dat, by.x="NAME", by.y="id", duplicateGeoms=TRUE)
class(widat)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
head(widat)
## NAME STATEFP COUNTYFP COUNTYNS AFFGEOID GEOID LSAD ALAND
## 2 Ashland 55 003 01581061 0500000US55003 55003 06 2706476264
## 22 Grant 55 043 01581081 0500000US55043 55043 06 2970403781
## 57 Sawyer 55 113 01581116 0500000US55113 55113 06 3257216619
## 12 Crawford 55 023 01581071 0500000US55023 55023 06 1477970887
## 19 Florence 55 037 01581078 0500000US55037 55037 06 1264232458
## 26 Iron 55 051 01581085 0500000US55051 55051 06 1963699141
## AWATER ntrials ncases time
## 2 3230710864 668 71 Period1
## 22 94520011 636 66 Period1
## 57 239883874 737 60 Period1
## 12 73976438 743 67 Period1
## 19 24111636 1262 106 Period1
## 26 416924548 773 67 Period1
In order to extract the centroid from each county, we first convert our new SpatialPolygonsDataFrame
to a simple features (sf) object using st_as_sf()
from the sf
package. It is possible to extract centroids from the SpatialPolygonsDataFrame
, but we do so here in order to also illustrate the sf
package:
#convert to simple features sf
widatsf <- st_as_sf(widat)
class(widatsf)
## [1] "sf" "data.frame"
To prepare widatsf
for use with clusso()
, we convert time
and NAME
to factors:
widatsf$time <- as.factor(widatsf$time)
widatsf$NAME <- as.factor(widatsf$NAME)
We extract the centroids using st_centroid()
and extract the coordinates using st_coordinates()
; both functions are from the sf
package.
coords <- as.data.frame(st_coordinates(st_centroid(widatsf)))
head(coords)
## X Y
## 1 -90.67794 46.31608
## 2 -90.70620 42.86748
## 3 -91.14454 45.87998
## 4 -90.93104 43.23947
## 5 -88.39810 45.84846
## 6 -90.24206 46.26227
We can plot the centroids on top of the base map we created above, p1:
p1 + geom_point(data=coords,aes(x=X, y=Y), color="red", shape=3)
We can visualize the proportion of cases from the total number of trials by creating a new variable, prop
:
widatsf <- widatsf %>%
mutate(prop = ncases/ntrials)
ggplot(data=widatsf) +
geom_histogram(aes(x=prop),binwidth = 0.01,
fill="grey", color="black") +
theme_minimal() +
xlab("Proportion") +
ggtitle("Histogram of Proportion Cases/Trials")
We visualize the proportion on the Wisconsin map. We take the natural log of the proportion to better see small differences at the lower end in the proportion:
ggplot(widatsf) +
geom_sf(aes(fill=log(prop))) +
scale_fill_viridis_c() +
xlab("Longitude") +
ylab("Latitude") +
ggtitle("(Log) Proportion of Cases/Trials in Wisconsin")
clusso
We first set global parameters, namely the maximum radius size for each potential cluster (rMax <- 100
), the number of time periods to 1 (Time <- 1
). We also extract our x and y coordinates, which are in longitude and latitude, and set them as xx
and yy
.
To perform the regularized spatial and spatio-temporal cluster detection using the Lasso penalty, we use the function clusso()
. In the first argument, we set the name of the dataframe, df=widatsf
. In the second argument we set expected
counts equal to the total number of trials in the widatsf
dataframe (expected = ntrials
). In the third argument, we set the observed counts equal to the total number of cases in each county in the widatsf
dataframe (observed = ncases
). In the fourth argument, we set the timeperiod equal to the name of the time periods in the widatsf
dataframe (timeperiod = time
). In this example, we do not want to include other covariates as un-penalized terms in the model so we set covars = FALSE
.
The next two arguments are x
and y
, which are the longitude and latitude for each centroids. Since these are longitude and latitude coordinates, we also set utm=FALSE
. This will tell clusso
to use the great-circle-distance or Haversine distance to calculate the distance between a centroid up to rMax
.
The next argument is the maximum radius (in km) for the potential clusters. We set rMax = rMax
, which we defined above as 100km. The argument utm
specifies that our coordinates are NOT in UTM coordinates (utm = FALSE
). The is analysis
argument defines the type of analysis to be performed. Options include space
, spacetime
, or both
. Here, we set analysis
equal to space
, which will perform only the spatial analysis. The argument model
is the type of model to be used. In this example, we specify the binomial model. The argument maxclust
is the maximum number of actual clusters expected in the given area. This is based on scientific knowledge of the area. For this dataset, we set maxclust=15
. If the number of identified clusters exceeds maxclust
, a warning will encourage you to increase your set maxclust
.
The output of clusso()
is assigned to the object resclusso
. resclusso
is a large list of lists. The recommended way to explore the results is to explore the names(resreal)
and select each sub-list of interest.
rMax <- 100
Time <- 1
xx <- coords$X
yy <- coords$Y
resclusso <- clusso(df=widatsf,
expected = ntrials,
observed = ncases,
timeperiod = time,
id = NAME,
covars = FALSE,
x= xx,
y= yy,
rMax = rMax,
utm=FALSE,
analysis = "space",
model="binomial",
maxclust = 15, collapsetime = TRUE)
clussopretty(resclusso, analysis="space", model="binomial",clusteridentify=FALSE)
## model analysistype numclust.AIC numclust.AICc numclust.BIC
## 1 Binomial Space 17 17 17
## 2 Quasi-Binomial Space 16 16 13
clusso
Results on the MapWe set the max and min relative risk for the color scheme using maxrr
and minrr
. You may want to explore these values depending on the presentation you are using (a shortened maxrr, minrr distance will bring colors in closer and maps on a projector will appear more vivid and darker).
We extract the relative risk estimates using BIC from resclusso$lassoresult.p.s$E.qbic
and then use redblue()
to map the estimated relative risks to colors.
maxrr <- 10
minrr <- 0.1
estrr.bic <- resclusso$lassoresult.p.s$E.qbic
cluster_ix <- redblue(log(maxrr * pmax(minrr, pmin(estrr.bic, maxrr)))/log(maxrr^2))
table(cluster_ix)
## cluster_ix
## #063365 #124A86 #1D5FA2 #2166AC #266CAF #3681BA #3783BB #3885BB #408FC1 #4493C3
## 1 1 1 2 1 2 3 2 3 1
## #4594C4 #4795C4 #4996C5 #4D99C6 #4E99C6 #4E9AC6 #539DC8 #5DA3CC #62A7CD #67AACF
## 3 1 1 4 1 7 4 2 1 1
## #6EAED1 #7CB7D6 #86BDDA #87BEDA #8BC0DB #91C4DD #97C7DF #9FCBE1 #A8D0E4 #BDDBEA
## 1 2 1 3 5 4 1 3 1 1
## #F2F4F6 #F6F6F6 #F8BDA0
## 1 1 6
Next, we combine the estimated relative risks (resclusso$lassoresult.p.s$E.qbic
), county names (widatsf$NAME
), and mapped colors (cluster_ix
) into the dataframe rr.bic
. We map the county names onto the colors (names(cluster_ix) <- rr.bic$county
).
rr.bic <- cbind.data.frame(rr=resclusso$lassoresult.p.s$E.qbic,
county=widatsf$NAME,
colors = cluster_ix)
names(cluster_ix) <- rr.bic$county
We merge the dataframe rr.bic
to the SpatialPolygonsDataFrame
, wi
, by county name and convert it to a simple features dataframe for use with ggplot2
.
merged.bic<- sp::merge(wi,rr.bic, by.x="NAME", by.y="county", duplicateGeoms=TRUE)
datdf.bic <- st_as_sf(merged.bic)
Finally, we use geom_sf()
to plot the estimated relative risks onto the map. We use the aesthetic (aes(fill=NAME)
) and scale_fill_manual(values=cluster_ix)
to plot our pre-defined colors onto the map.
p1+ geom_sf(data=datdf.bic, aes(fill=NAME)) +
scale_colour_manual(values = c("red", "blue", "green")) +
scale_fill_manual(values=cluster_ix)+
theme(plot.title = element_text(hjust=0.5)) +
ggtitle("Binomial, Space: BIC")
For a simpler map, we can also isolate the counties with relative risks greater than 1 and only outline county edges with red:
ggplot()+ geom_sf(data=datdf.bic, lwd=0, fill='transparent') +
geom_sf(data=subset(datdf.bic, rr>1), color="red", size=2) +
theme_bw()+
theme(legend.position = "none",
plot.title = element_text(hjust=0.5)) +
ggtitle("Binomial, Space: BIC")