library(tidycensus)
library(tidyverse)
library(readr)
library(car)
library(spdep)
library(tigris)
library(sf)
pov <- read_csv("../data/supplement_poverty.csv")
str(pov)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 3219 obs. of 9 variables:
## $ X1 : num 1 2 3 4 5 6 7 8 9 10 ...
## $ GEOID : chr "01001" "01003" "01005" "01007" ...
## $ STATE : chr "01" "01" "01" "01" ...
## $ NAME : chr "Autauga County" "Baldwin County" "Barbour County" "Bibb County" ...
## $ totpopn: num 43671 140415 29038 20826 51024 ...
## $ poverty: num 108.5 99.8 242.2 196.4 116.2 ...
## $ ag : num 10.3 8.3 14.6 21.5 13.7 ...
## $ manu : num 73.9 56.2 110.1 90.8 86.9 ...
## $ retail : num 57.6 63.7 36.9 38 51.6 ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_double(),
## .. GEOID = col_character(),
## .. STATE = col_character(),
## .. NAME = col_character(),
## .. totpopn = col_double(),
## .. poverty = col_double(),
## .. ag = col_double(),
## .. manu = col_double(),
## .. retail = col_double()
## .. )
Without consideration for spatial dependence of the counties, let’s first explore a standard linear regression model:
m1 <- lm(poverty ~ ag + manu + retail, data=pov)
summary(m1)
##
## Call:
## lm(formula = poverty ~ ag + manu + retail, data = pov)
##
## Residuals:
## Min 1Q Median 3Q Max
## -306.85 -39.92 -6.68 31.62 427.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 419.02088 6.18055 67.80 <2e-16 ***
## ag -0.62626 0.03815 -16.42 <2e-16 ***
## manu -0.49124 0.03044 -16.14 <2e-16 ***
## retail -4.36040 0.10513 -41.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 68.02 on 3215 degrees of freedom
## Multiple R-squared: 0.3953, Adjusted R-squared: 0.3948
## F-statistic: 700.7 on 3 and 3215 DF, p-value: < 2.2e-16
Diagnostic plots:
par(mfrow=c(1,2))
plot(m1, which=c(1,2))
Evidence of heteroskedasticity based on residuals plot. We fill first explore transformations.
There are many various linear transformations one can take (log, logit, square root, arcsine). In this example, we will explore the logit transformation which is often used in demographic studies. The logit can be defined as:
\[logit(p) = log(\frac{p}{1-p})\]
m1_logit <- lm(car::logit(poverty) ~ ag + manu + retail, data = pov)
summary(m1_logit)
##
## Call:
## lm(formula = car::logit(poverty) ~ ag + manu + retail, data = pov)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8684 -0.9068 -0.1888 0.6586 6.6363
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.9515724 0.2639597 7.393 2.94e-13 ***
## ag 0.0069977 0.0015480 4.520 6.88e-06 ***
## manu -0.0022029 0.0009441 -2.333 0.0198 *
## retail -0.0072726 0.0039935 -1.821 0.0689 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.299 on 1041 degrees of freedom
## (2174 observations deleted due to missingness)
## Multiple R-squared: 0.04127, Adjusted R-squared: 0.0385
## F-statistic: 14.94 on 3 and 1041 DF, p-value: 1.592e-09
par(mfrow=c(1,2))
plot(m1_logit, which=c(1,2))
After the transformation, we observe that while the residuals vs. fitted plot looks better, there still appears to be a pattern in the residuals, indicating heteroskedasticity. For the QQ plot, the majority of the observations align nicely along the 45 degree line. However we do observe a slight bowl shape, indicating right skewness in the distribution of the residuals.
m1_weight <- lm(poverty ~ ag + manu + retail, data = pov, weights = totpopn)
summary(m1_weight)
##
## Call:
## lm(formula = poverty ~ ag + manu + retail, data = pov, weights = totpopn)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -113795 -5923 -617 6300 125250
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 378.63017 5.42314 69.817 < 2e-16 ***
## ag 0.30450 0.06805 4.475 7.91e-06 ***
## manu -0.33238 0.02708 -12.276 < 2e-16 ***
## retail -4.36385 0.09880 -44.170 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15220 on 3215 degrees of freedom
## Multiple R-squared: 0.4379, Adjusted R-squared: 0.4373
## F-statistic: 834.8 on 3 and 3215 DF, p-value: < 2.2e-16
par(mfrow=c(1,2))
plot(m1_weight, which=c(1,2))
To demonstrate how these transformations will look like with spatial linear models, we will use the SAR model across the transformation and weighting.
First, prepare the spatial data:
uscounties <- counties(state = pov$STATE,
cb=TRUE,year=2000)
uscounties_sf <- st_as_sf(uscounties)
uscounties_sf$FIPS <- paste0(uscounties_sf$STATEFP, uscounties_sf$COUNTYFP)
#merge sf to pov
pov_uscounties_sf <- merge(uscounties_sf, pov, by.x="FIPS", by.y="GEOID")
povnb <- poly2nb(as_Spatial(pov_uscounties_sf))
listw_povW = nb2listw(povnb, style="W",zero.policy = TRUE)
m1_logit.sar <- spautolm(car::logit(poverty) ~ ag + manu + retail, data = pov,
family="SAR", listw = listw_povW, zero.policy = TRUE)
summary(m1_logit.sar)
##
## Call: spautolm(formula = car::logit(poverty) ~ ag + manu + retail,
## data = pov, listw = listw_povW, family = "SAR", zero.policy = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.74457 -0.86637 -0.20888 0.62253 6.45728
##
## Regions with no neighbours included:
## 2 26 59 72 77 79 84 100 113 214 240 343 369 380 407 410 420 425 435 456 460 472 498 538 545 591 616 636 897 1162 1179 1213 1224 1294 1299 1320 1340 1415 1459 1502 1565 1567 1643 1666 1724 1808 1852 1868 1879 1915 1972 1992 2035 2101 2214 2216 2288 2320 2331 2345 2388 2405 2489 2568 2570 2623 2638 2652 2671 2690 2766 2863 2955 2980 3031
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.1164726 0.2721351 7.7773 7.327e-15
## ag 0.0053853 0.0017224 3.1266 0.001768
## manu -0.0021443 0.0010726 -1.9992 0.045586
## retail -0.0089858 0.0040374 -2.2256 0.026040
##
## Lambda: 0.23855 LR test value: 42.948 p-value: 5.6212e-11
## Numerical Hessian standard error of lambda: 0.035294
##
## Log likelihood: -1732.617
## ML residual variance (sigma squared): 1.5847, (sigma: 1.2588)
## Number of observations: 1045
## Number of parameters estimated: 6
## AIC: 3477.2
plot(fitted(m1_logit.sar),residuals(m1_logit.sar), cex=0.5,
xlab="Fitted Values", ylab="Residuals", main="Fitted Values vs. Residuals Plot")
abline(h=0, col="red", lty=2)
m1_weight.sar <- spautolm(poverty ~ ag + manu + retail, data = pov,
family="SAR", listw = listw_povW, zero.policy = TRUE,
weights = totpopn)
summary(m1_weight.sar)
##
## Call: spautolm(formula = poverty ~ ag + manu + retail, data = pov,
## listw = listw_povW, weights = totpopn, family = "SAR", zero.policy = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -307.0172 -21.8133 3.2856 29.0158 415.9834
##
## Regions with no neighbours included:
## 69 544 545 547 1224 1868 2980 3166 3216
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 337.504518 5.142049 65.6362 < 2.2e-16
## ag -0.478023 0.063809 -7.4915 6.817e-14
## manu -0.420046 0.030183 -13.9168 < 2.2e-16
## retail -3.285576 0.083234 -39.4740 < 2.2e-16
##
## Lambda: 0.7097 LR test value: 1960.8 p-value: < 2.22e-16
## Numerical Hessian standard error of lambda: 0.011416
##
## Log likelihood: -18122.11
## ML residual variance (sigma squared): 111820000, (sigma: 10574)
## Number of observations: 3219
## Number of parameters estimated: 6
## AIC: 36256
plot(fitted(m1_weight.sar),residuals(m1_weight.sar), cex=0.5,
xlab="Fitted Values", ylab="Residuals", main="Fitted Values vs. Residuals Plot")
abline(h=0, col="red", lty=2)