Set-Up

library(tidycensus)
library(tidyverse)
library(readr)
library(car)
library(spdep)
library(tigris)
library(sf)
pov <- read_csv("../data/supplement_poverty.csv")
str(pov)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 3219 obs. of  9 variables:
##  $ X1     : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ GEOID  : chr  "01001" "01003" "01005" "01007" ...
##  $ STATE  : chr  "01" "01" "01" "01" ...
##  $ NAME   : chr  "Autauga County" "Baldwin County" "Barbour County" "Bibb County" ...
##  $ totpopn: num  43671 140415 29038 20826 51024 ...
##  $ poverty: num  108.5 99.8 242.2 196.4 116.2 ...
##  $ ag     : num  10.3 8.3 14.6 21.5 13.7 ...
##  $ manu   : num  73.9 56.2 110.1 90.8 86.9 ...
##  $ retail : num  57.6 63.7 36.9 38 51.6 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_double(),
##   ..   GEOID = col_character(),
##   ..   STATE = col_character(),
##   ..   NAME = col_character(),
##   ..   totpopn = col_double(),
##   ..   poverty = col_double(),
##   ..   ag = col_double(),
##   ..   manu = col_double(),
##   ..   retail = col_double()
##   .. )

Standard Linear Regression

Without consideration for spatial dependence of the counties, let’s first explore a standard linear regression model:

m1 <- lm(poverty ~ ag + manu + retail, data=pov)
summary(m1)
## 
## Call:
## lm(formula = poverty ~ ag + manu + retail, data = pov)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -306.85  -39.92   -6.68   31.62  427.28 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 419.02088    6.18055   67.80   <2e-16 ***
## ag           -0.62626    0.03815  -16.42   <2e-16 ***
## manu         -0.49124    0.03044  -16.14   <2e-16 ***
## retail       -4.36040    0.10513  -41.47   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 68.02 on 3215 degrees of freedom
## Multiple R-squared:  0.3953, Adjusted R-squared:  0.3948 
## F-statistic: 700.7 on 3 and 3215 DF,  p-value: < 2.2e-16

Diagnostic plots:

par(mfrow=c(1,2))
plot(m1, which=c(1,2))

Evidence of heteroskedasticity based on residuals plot. We fill first explore transformations.

Transformations

There are many various linear transformations one can take (log, logit, square root, arcsine). In this example, we will explore the logit transformation which is often used in demographic studies. The logit can be defined as:

\[logit(p) = log(\frac{p}{1-p})\]

m1_logit <- lm(car::logit(poverty) ~ ag + manu + retail, data = pov)
summary(m1_logit)
## 
## Call:
## lm(formula = car::logit(poverty) ~ ag + manu + retail, data = pov)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.8684 -0.9068 -0.1888  0.6586  6.6363 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.9515724  0.2639597   7.393 2.94e-13 ***
## ag           0.0069977  0.0015480   4.520 6.88e-06 ***
## manu        -0.0022029  0.0009441  -2.333   0.0198 *  
## retail      -0.0072726  0.0039935  -1.821   0.0689 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.299 on 1041 degrees of freedom
##   (2174 observations deleted due to missingness)
## Multiple R-squared:  0.04127,    Adjusted R-squared:  0.0385 
## F-statistic: 14.94 on 3 and 1041 DF,  p-value: 1.592e-09
par(mfrow=c(1,2))
plot(m1_logit, which=c(1,2))

After the transformation, we observe that while the residuals vs. fitted plot looks better, there still appears to be a pattern in the residuals, indicating heteroskedasticity. For the QQ plot, the majority of the observations align nicely along the 45 degree line. However we do observe a slight bowl shape, indicating right skewness in the distribution of the residuals.

Weighting

m1_weight <- lm(poverty ~ ag + manu + retail, data = pov, weights = totpopn)
summary(m1_weight)
## 
## Call:
## lm(formula = poverty ~ ag + manu + retail, data = pov, weights = totpopn)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -113795   -5923    -617    6300  125250 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 378.63017    5.42314  69.817  < 2e-16 ***
## ag            0.30450    0.06805   4.475 7.91e-06 ***
## manu         -0.33238    0.02708 -12.276  < 2e-16 ***
## retail       -4.36385    0.09880 -44.170  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15220 on 3215 degrees of freedom
## Multiple R-squared:  0.4379, Adjusted R-squared:  0.4373 
## F-statistic: 834.8 on 3 and 3215 DF,  p-value: < 2.2e-16
par(mfrow=c(1,2))
plot(m1_weight, which=c(1,2))

Spatial Linear Models

To demonstrate how these transformations will look like with spatial linear models, we will use the SAR model across the transformation and weighting.

First, prepare the spatial data:

uscounties <- counties(state = pov$STATE,
                       cb=TRUE,year=2000)
uscounties_sf <- st_as_sf(uscounties)
uscounties_sf$FIPS <- paste0(uscounties_sf$STATEFP, uscounties_sf$COUNTYFP)
#merge sf to pov
pov_uscounties_sf <- merge(uscounties_sf, pov, by.x="FIPS", by.y="GEOID")
povnb <- poly2nb(as_Spatial(pov_uscounties_sf))
listw_povW = nb2listw(povnb, style="W",zero.policy = TRUE)

Transformations

m1_logit.sar <- spautolm(car::logit(poverty) ~ ag + manu + retail, data = pov, 
                         family="SAR", listw = listw_povW, zero.policy = TRUE)
summary(m1_logit.sar)
## 
## Call: spautolm(formula = car::logit(poverty) ~ ag + manu + retail, 
##     data = pov, listw = listw_povW, family = "SAR", zero.policy = TRUE)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -6.74457 -0.86637 -0.20888  0.62253  6.45728 
## 
## Regions with no neighbours included:
##  2 26 59 72 77 79 84 100 113 214 240 343 369 380 407 410 420 425 435 456 460 472 498 538 545 591 616 636 897 1162 1179 1213 1224 1294 1299 1320 1340 1415 1459 1502 1565 1567 1643 1666 1724 1808 1852 1868 1879 1915 1972 1992 2035 2101 2214 2216 2288 2320 2331 2345 2388 2405 2489 2568 2570 2623 2638 2652 2671 2690 2766 2863 2955 2980 3031 
## 
## Coefficients: 
##               Estimate Std. Error z value  Pr(>|z|)
## (Intercept)  2.1164726  0.2721351  7.7773 7.327e-15
## ag           0.0053853  0.0017224  3.1266  0.001768
## manu        -0.0021443  0.0010726 -1.9992  0.045586
## retail      -0.0089858  0.0040374 -2.2256  0.026040
## 
## Lambda: 0.23855 LR test value: 42.948 p-value: 5.6212e-11 
## Numerical Hessian standard error of lambda: 0.035294 
## 
## Log likelihood: -1732.617 
## ML residual variance (sigma squared): 1.5847, (sigma: 1.2588)
## Number of observations: 1045 
## Number of parameters estimated: 6 
## AIC: 3477.2
plot(fitted(m1_logit.sar),residuals(m1_logit.sar), cex=0.5,
     xlab="Fitted Values", ylab="Residuals", main="Fitted Values vs. Residuals Plot")
abline(h=0, col="red", lty=2)
Fitted values versus residuals plot for logit transformed SAR model

Fitted values versus residuals plot for logit transformed SAR model

Weighting

m1_weight.sar <- spautolm(poverty ~ ag + manu + retail, data = pov, 
                         family="SAR", listw = listw_povW, zero.policy = TRUE,
                         weights = totpopn)
summary(m1_weight.sar)
## 
## Call: spautolm(formula = poverty ~ ag + manu + retail, data = pov, 
##     listw = listw_povW, weights = totpopn, family = "SAR", zero.policy = TRUE)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -307.0172  -21.8133    3.2856   29.0158  415.9834 
## 
## Regions with no neighbours included:
##  69 544 545 547 1224 1868 2980 3166 3216 
## 
## Coefficients: 
##               Estimate Std. Error  z value  Pr(>|z|)
## (Intercept) 337.504518   5.142049  65.6362 < 2.2e-16
## ag           -0.478023   0.063809  -7.4915 6.817e-14
## manu         -0.420046   0.030183 -13.9168 < 2.2e-16
## retail       -3.285576   0.083234 -39.4740 < 2.2e-16
## 
## Lambda: 0.7097 LR test value: 1960.8 p-value: < 2.22e-16 
## Numerical Hessian standard error of lambda: 0.011416 
## 
## Log likelihood: -18122.11 
## ML residual variance (sigma squared): 111820000, (sigma: 10574)
## Number of observations: 3219 
## Number of parameters estimated: 6 
## AIC: 36256
plot(fitted(m1_weight.sar),residuals(m1_weight.sar), cex=0.5,
     xlab="Fitted Values", ylab="Residuals", main="Fitted Values vs. Residuals Plot")
abline(h=0, col="red", lty=2)
Fitted values versus residuals plot for population-weighted SAR model

Fitted values versus residuals plot for population-weighted SAR model