Communities in LA County: What Do We Know?

R25 Modelers and Story Tellers

Author

Drs. Hua Zhou and Roch Nianogo

Published

July 20, 2023

1 Introduction

UCLA is located in the Los Angeles County (LA County). Let us use various data science tools in R (tidyverse, tidycensus) to explore the demographics of the LA County. It gives a preview of the Import -> Tidy -> Transform -> Visualize workflow that we’ll practice today and tomorrow. Model and Communicate will be the focus on Days 3 and 4.

This dynamic document was produced by Quarto, which supports R, Python, and Julia. Detailed R syntax will be covered in the next tutorials.

Code

lac_acs_2020 <- get_acs(
  geography = "tract",
  county = c("Los Angeles"),
  state = "CA",
  variable = c(total_pop = "B01003_001"),
  year = 2020,
  geometry = TRUE,
  output = "wide"
) %>%
  st_transform(4326)

There are 2498 census tracts in LA County. On average each census tract contains 4000 residents. These define our communities and we want to visualize health, income, education, housing, and health disparities in these communities.

Code

ucla_map <- tibble(
  location = c("UCLA"), 
  longitude = c(-118.444990),
  latitude = c(34.070963)
  ) %>%
  st_as_sf(
    coords = c("longitude", "latitude"),
    crs = 4326
    ) %>%
  st_transform(4326) %>%
  mapview(
    col.regions = "red",
    legend = FALSE
    )

mapview(lac_acs_2020, legend = FALSE) + ucla_map

2 A quiz about LA County

https://huazhou.shinyapps.io/R25-2023-00-la-quiz/

3 Population and people

Code

# 2022 Decennial Census data
lac_dec_2020 <- get_decennial(
  geography = "county",
  state = "CA",
  county = c("Los Angeles"),
  variables = c(
    total_pop = "P2_001N"
  ),
  year = 2020
)

# National population size from 2020 Decennial Census.
uspop20 <- get_decennial(
  geography = "state", 
  variables = "P2_001N",
  year = 2020
)

LA metro area has 10014009 residents, about 3% of the US population (2020 Decennial Census).

3.1 Age and sex

Code

# 2021 1-year ACS data for LA County
lac_acs1_2021 <- get_acs(
  geography = "county",
  county = c("Los Angeles"),
  state = "CA",
  year = 2021,
  variables = c(med_age = "B01002_001"), 
  survey = "acs1"
)

The median age is 37.8 in LA County (2021 ACS).

Population by Age and Sex (Pyramid Plot):

Code

# ingest
lac_pyramid <- get_estimates(
  geography = "county",
  county = c("Los Angeles"),
  state = "CA",
  product = "characteristics",
  breakdown = c("SEX", "AGEGROUP"),
  breakdown_labels = TRUE,
  year = 2019
) %>% 
  # wrangle
  filter(
    str_detect(AGEGROUP, "^Age"),
    SEX != "Both sexes"
    ) %>%
  mutate(value = ifelse(SEX == "Male", -value, value)) %>%
  # visualize
  ggplot(aes(x = value, y = AGEGROUP, fill = SEX)) +
  geom_col(width = 0.95, alpha = 0.75) +
  theme_minimal(base_family = "Verdana", base_size = 12) +
  scale_x_continuous(
    labels = ~ number_format(scale = .001, suffix = "k")(abs(.x)),
    limits = 1000000 * c(-0.5, 0.5)
  ) +
  scale_y_discrete(labels = ~ str_remove_all(.x, "Age\\s|\\syears")) +
  scale_fill_manual(values = c("darkred", "navy")) +
  labs(x = "",
       y = "2019 ACS estimate",
       title = "Population structure in Los Angeles County",
       fill = "",
       caption = "Data source: US Census Bureau population estimates")

ggplotly(lac_pyramid)

Code

# ingest
senior_prop <- get_estimates(
    geography = "county",
    county = c("Los Angeles"),
    state = "CA",
    product = "characteristics",
    breakdown = c("SEX", "AGEGROUP"),
    breakdown_labels = TRUE,
    year = 2019
) %>% 
  # wrangle
  filter(
    str_detect(AGEGROUP, "^Age"),
    SEX != "Both sexes"
) %>% 
  mutate(AGEGROUP = as.ordered(AGEGROUP)) %>%
  mutate(senior = AGEGROUP >= "Age 65 to 69 years") %>%
  summarise(seniorprop = sum(senior * value) / sum(value))

14.1% of Angelenos are 65 years and older（2014-2019 ACS).

3.2 Race and ethnicity

Code

# Count of races in each tract from the 2020 decennial census
lac_race <- get_decennial(
  geography = "tract",
  state = "CA",
  county = c("Los Angeles"),
  variables = c(
    Hispanic = "P2_002N",
    White = "P2_005N",
    Black = "P2_006N",
    Native = "P2_007N",
    Asian = "P2_008N"
  ),
  summary_var = "P2_001N",
  year = 2020,
  geometry = TRUE
) %>%
  mutate(percent = 100 * (value / summary_value))

3.2.1 Faceted maps

Percentages of Asian, Black, Hispanic, Native, and White populations in each census tract in LA County.

Code

lac_race %>%
  drop_na() %>%
  tm_shape() +
  tm_facets(by = "variable", scale.factor = 4) +
  tm_fill(col = "percent",
          style = "quantile",
          n = 6,
          palette = "Blues",
          title = "Percent (2020 US Census)",) +
  tm_layout(bg.color = "grey",
            legend.position = c(-0.7, 0.15),
            panel.label.bg.color = "white")

3.2.2 Dot-density map

Code

lac_dots <- lac_race %>%
  drop_na() %>%
  as_dot_density(
    value = "value",
    values_per_dot = 150,
    group = "variable"
  )

background_tracts <- lac_race %>%
  drop_na() %>%
  filter(variable == "White")

tm_shape(background_tracts) +
  tm_polygons(col = "white",
              border.col = "grey") +
  tm_shape(lac_dots) +
  tm_dots(col = "variable",
          palette = "Set1",
          size = 0.005,
          title = "1 dot = 150 people") +
  tm_layout(legend.outside = TRUE,
            title = "Race/Ethnicity\n2020 Census")

3.3 Segregation and diversity

The following table tallies the segregation indices $H$ in major urban areas in California with population > 750,000. Higher $H$ indicates more segregation.

Code

library(segregation)

# Get California tract data by race/ethnicity
ca_acs_data <- get_acs(
  geography = "tract",
  variables = c(
    white = "B03002_003",
    black = "B03002_004",
    asian = "B03002_006",
    hispanic = "B03002_012"
  ), 
  state = "CA",
  geometry = TRUE,
  year = 2019
) 

# Use tidycensus to get urbanized areas by population with geometry, 
# then filter for those that have populations of 750,000 or more
us_urban_areas <- get_acs(
  geography = "urban area",
  variables = "B01001_001",
  geometry = TRUE,
  year = 2019,
  survey = "acs1"
) %>%
  filter(estimate >= 750000) %>%
  transmute(urban_name = str_remove(NAME, 
                                    fixed(", CA Urbanized Area (2010)")))

# Compute an inner spatial join between the California tracts and the 
# urbanized areas, returning tracts in the largest California urban 
# areas with the urban_name column appended
ca_urban_data <- ca_acs_data %>%
  st_join(us_urban_areas, left = FALSE) %>%
  select(-NAME) %>%
  st_drop_geometry()

mutual_within(
  data = ca_urban_data,
  group = "variable",
  unit = "GEOID",
  weight = "estimate",
  within = "urban_name",
  wide = TRUE
) %>%
  select(urban_name, H) %>%
  arrange(desc(H))

                         urban_name         H
1: Los Angeles--Long Beach--Anaheim 0.2851662
2:           San Francisco--Oakland 0.2116127
3:                        San Diego 0.2025728
4:                         San Jose 0.1829190
5:                       Sacramento 0.1426804
6:        Riverside--San Bernardino 0.1408461

3.3.1 Local segregation analysis

Patterns of segregation across the most segregated urban area, Los Angeles:

Code

la_local_seg <- ca_urban_data %>%
  filter(urban_name == "Los Angeles--Long Beach--Anaheim") %>%
  mutual_local(
    group = "variable",
    unit = "GEOID",
    weight = "estimate", 
    wide = TRUE
  )

la_tracts_seg <- tracts("CA", cb = TRUE, year = 2019) %>%
  inner_join(la_local_seg, by = "GEOID") 

la_tracts_seg %>%
  ggplot(aes(fill = ls)) + 
  geom_sf(color = NA) + 
  coord_sf(crs = 26946) + 
  scale_fill_viridis_c(option = "inferno") + 
  theme_void() + 
  labs(fill = "Local\nsegregation index")

3.4 Language spoken at home

Code

lac_lang <- get_acs(
  geography = "county",
  county = c("Los Angeles"),
  state = "CA",
  year = 2021,
  variables = c(
    English = "DP02_0113",
    Spanish = "DP02_0116",
    Indo_European = "DP02_0118",
    Asian = "DP02_0120",
    Other = "DP02_0122"
  ),
  summary_var = "DP02_0112",
  survey = "acs5"
  ) %>%
  mutate(percent = estimate / summary_est)

55.8% of Angelenos speak a language other than English at home (2016-2021 ACS).

Code

lac_lang_plot <- lac_lang %>%
  ggplot(aes(x = fct_rev(fct_reorder(variable, percent)), y = percent)) + 
  geom_col(color = "navy", fill = "navy", 
           alpha = 0.5, width = 0.4) + 
  scale_y_continuous(labels = label_percent(scale = 100)) +
  labs(
    title = "Languages spoken at home in LA County",
    subtitle = "2016-2021 ACS, population 5 years and over",
    x = "Language",
    y = "Percent"
  )

ggplotly(lac_lang_plot)

3.5 Native and foreign born

Code

lac_foreignborn <- get_acs(
  geography = "county",
  county = c("Los Angeles"),
  state = "CA",
  year = 2021,
  variables = c(
    Total = "DP02_0105",
    Europe = "DP02_0106",
    Asia = "DP02_0107",
    Africa = "DP02_0108",
    Oceania = "DP02_0109",
    Latin_America = "DP02_0110",
    North_American = "DP02_0111"
  ),
  summary_var = "B01003_001",
  survey = "acs5"
  ) %>%
  mutate(percent = estimate / summary_est)

33.5% of Angelenos are foreign-born (2016-2021 ACS).

Code

lac_foreignborn_plot <- lac_foreignborn %>%
  filter(variable != "Total") %>%
  ggplot(aes(x = fct_rev(fct_reorder(variable, percent)), y = percent)) + 
  geom_col(color = "navy", fill = "navy", 
           alpha = 0.5, width = 0.5) + 
  scale_y_continuous(labels = label_percent(scale = 100)) +
  labs(
    title = "Foreign born population in LA County",
    subtitle = "2016-2021 ACS",
    x = "Continent",
    y = "Percent"
  )

ggplotly(lac_foreignborn_plot)

4 Employment

Code

# 2021 1-year ACS data for LA County
acs1_variables <- c(
    total_population = "B01003_001",
    med_age = "B01002_001",
    med_house_val = "B25077_001",
    med_rooms = "B25018_001",
    med_year_built = "B25037_001",
    med_income = "DP03_0062",
    pct_college = "DP02_0068P",
    pct_foreign_born = "DP02_0094P",
    pct_insured = "DP03_0096P",
    pct_ooh = "DP04_0046P",
    pct_white = "DP05_0077P",
    poverty_denom = "B17010_001",
    poverty_num = "B17010_002",
    pop_16above = "DP03_0001",
    emp_16above = "DP03_0004"
    )

lac_acs1_2021 <- get_acs(
  geography = "county",
  county = c("Los Angeles"),
  state = "CA",
  year = 2021,
  variables = c(
    pop_16above = "DP03_0001",
    emp_16above = "DP03_0004"
  ), 
  survey = "acs1"
  )

The employment rate in population 16 years and over in LA County is 58.3% (2021 ACS).

5 Income and poverty

Code

lac_acs1_2021 <- get_acs(
  geography = "county",
  county = c("Los Angeles"),
  state = "CA",
  year = 2021,
  variables = c(
    med_income = "DP03_0062"
  ), 
  survey = "acs1"
  )

The median household income in LA County is $77456 (2021 ACS).

Code

lac_poverty <- get_acs(
  geography = "county",
  state = "CA",
  county = "Los Angeles",
  year = 2021,
  survey = "acs1",
  variables = c(
    total_poverty_0_99 = "B06012_002",
    total_poverty_100_149 = "B06012_003",
    total_poverty_150_above = "B06012_004"
  )
)

14.2% percentage of Angelenos live under the poverty line (2021 ACS)!

Poverty struck neighborhoods:

Code

library(tidycensus)
library(ggiraph)
library(tidyverse)
library(patchwork)
library(scales)

lac_medincome <- get_acs(
  geography = "tract",
  variables = c(med_income = "DP03_0062"),
  county = c("Los Angeles"),
  state = "CA",
  year = 2021,
  survey = "acs5",
  geometry = TRUE,
  output = "wide"
)

Code

medinc_top10 <- lac_medincome %>%
  slice_min(med_incomeE, n = 10) %>%
  mutate(NAME = str_remove(NAME, ", Los Angeles County, California")) %>%
  mutate(NAME = str_remove(NAME, "Census"))

lametro_map <- ggplot() +
  geom_sf(data = county_subdivisions(state = "CA", county = "Los Angeles", cb = TRUE) %>% filter(!GEOID %in% c("0603792360", "0603792140", "0603792110", "0603793090", "0603790015", "0603790810", "0603790810", "0603793730", "0603791730", "0603790730", "0603792400", "0603793510", "0603793200"))) +
  geom_sf_interactive(
    data = medinc_top10, 
    mapping = aes(fill = med_incomeE, data_id = GEOID)
    ) +
  scale_fill_distiller(
    palette = "Greens",
    direction = 1,
    guide = "none"
    ) +
  theme_void()

lametro_plot <- ggplot(
  data = medinc_top10, 
  mapping = aes(x = med_incomeE, y = reorder(NAME, med_incomeE), fill = med_incomeE)
  ) +
  geom_errorbar(aes(xmin = med_incomeE - med_incomeM, xmax = med_incomeE + med_incomeM)) +
  geom_point_interactive(
    color = "black", 
    size = 4, 
    shape = 21,
    aes(data_id = GEOID)
    ) +
  scale_fill_distiller(
    palette = "Greens", 
    direction = 1,
    labels = label_dollar()
    ) +
  scale_x_continuous(labels = label_dollar()) +
  labs(title = "Household income by census tract in LA County",
       subtitle = "2016-2021 American Community Survey",
       y = "",
       x = "ACS estimate (bars represent margin of error)",
       fill = "ACS estimate") +
  theme_minimal(base_size = 14)

girafe(ggobj = lametro_map + lametro_plot, width_svg = 10, height_svg = 5) %>%
  girafe_options(opts_hover(css = "fill:red;"))

6 Education

Code

lac_edu <- get_acs(
  geography = "county", 
  state = "CA", 
  county = "Los Angeles",
  year = 2020,
  variables = c(
    less_hs = "B06009_002",
    hs = "B06009_003",
    as = "B06009_004",
    col = "B06009_005",
    grad = "B06009_006"
    ),
  summary_var = "B06009_001"
)

33.5% of the population aged 25 and up in LA County have a bachelor’s degree or higher (2016-2020 ACS).

Code

lac_pctcollege <- get_acs(
  geography = "tract",
  variables = c(pct_college = "DP02_0068P"),
  county = c("Los Angeles"),
  state = "CA",
  year = 2020,
  survey = "acs5",
  geometry = TRUE,
  output = "wide"
)

Code

mapview(lac_pctcollege, zcol = "pct_collegeE", layer = "% college") + ucla_map

7 Commuting

Code

lac_commute <- get_acs(
  geography = "county",
  county = "Los Angeles",
  state = "CA",
  variables = c(
    Drove_alone = "DP03_0019P",
    Carpool = "DP03_0020P",
    Public_transportation = "DP03_0021P",
    Walk = "DP03_0022P",
    Other_means = "DP03_0023P",
    Work_from_home = "DP03_0024P",
    Mean_travel_time_to_work_in_minutes = "DP03_0025"
  ),
  year = 2021
)

Code

lac_commute_plot <- lac_commute %>%
  filter(!str_detect(variable, "Mean_travel_time")) %>%
  ggplot() + 
  geom_col(
    aes(x = fct_rev(fct_reorder(variable, estimate)), y = estimate / 100),
    color = "navy", 
    fill = "navy", 
    alpha = 0.5,
    width = 0.6
    ) + 
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Commuting mode in LA County",
    subtitle = "2016-2021 ACS",
    x = NULL,
    y = "Percent among workers 16 years and over"
  )

ggplotly(lac_commute_plot)

An average Angeleno worker (16 years or over) spends 31.4 minutes commuting to work.

Code

get_acs(
  geography = "tract",
  variables = c(Commute_Time_in_Minutes = "DP03_0025"),
  county = c("Los Angeles"),
  state = "CA",
  year = 2021,
  survey = "acs5",
  geometry = TRUE,
  output = "wide"
) %>%
  # print() %>%
  mapview(zcol = "Commute_Time_in_MinutesE", layer.name = "Minutes") + ucla_map

8 Housing (rent burden)

Code

# PUMA in LA metro
lametro_pumas <- pumas(state = "CA", cb = TRUE, year = 2019) %>%
  filter(str_detect(NAME10, "(Los Angeles County|Orange County)"))

# PUMS variable
hh_variables <- c("PUMA", "GRPIP", "RAC1P",
                  "HISP", "HHT")

# ingest PUMS data
# get_pums(
#   variables = hh_variables,
#   state = "CA",
#   puma = lametro_pumas$PUMACE10,
#   year = 2020,
#   variables_filter = list(
#     SPORDER = 1,
#     TEN = 3
#   ),
#   recode = TRUE
# ) %>%
read_rds("lametro_hh_data.rds") %>%
  # recode
  mutate(
    race_ethnicity = case_when(
      HISP != "01" ~ "Hispanic",
      HISP == "01" & RAC1P == "1" ~ "White",
      HISP == "01" & RAC1P == "2" ~ "Black",
      TRUE ~ "Other"
    ),
    married = case_when(
      HHT == "1" ~ "Married",
      TRUE ~ "Not married"
    )
  ) %>%
  # group-wise summary
  filter(race_ethnicity != "Other") %>%
  group_by(race_ethnicity, married, PUMA) %>%
  summarize(
    prop_above_40 = sum(WGTP[GRPIP >= 40]) / sum(WGTP)
  ) %>%
  # join with PUMA
  left_join(lametro_pumas, by = c("PUMA" = "PUMACE10")) %>%
  # save as rds
  write_rds(file = "lametro_data_for_map.rds") %>%
  print()

Code

library(tmap)

read_rds("lametro_data_for_map.rds") %>%
  st_as_sf() %>%
  tm_shape() +
  tm_facets(by = c("race_ethnicity", "married"), scale.factor = 6) +
  tm_fill(col = "prop_above_40",
          style = "quantile",
          n = 5,
          palette = "Blues",
          title = "Percent households") +
  tm_layout(bg.color = "grey",
            legend.outside = TRUE,
            panel.label.bg.color = "white",
            main.title = "Rent burdened-households in LA County\n2016-2020 ACS estimate (from PUMS data)")

We define a household to be rent-burdened when gross rent is 40 percent or more of household income.

Code

library(survey)
library(srvyr)

read_rds("lametro_hh_replicate.rds") %>%
  to_survey(type = "housing",
            design = "rep_weights") %>%
  filter(TEN == 3) %>%
  mutate(
    race_ethnicity = case_when(
      HISP != "01" ~ "Hispanic",
      HISP == "01" & RAC1P == "1" ~ "White",
      HISP == "01" & RAC1P == "2" ~ "Black",
      TRUE ~ "Other"
    ),
    married = case_when(
      HHT == "1" ~ "Married",
      TRUE ~ "Not married"
    ),
    above_40 = GRPIP >= 40
  ) %>%
  filter(race_ethnicity != "Other") %>%
  group_by(race_ethnicity, married) %>%
  summarize(
    prop_above_40 = survey_mean(above_40)
  ) %>%
  # MOE
  mutate(prop_above_40_moe = prop_above_40_se * 1.645,
         label = paste(race_ethnicity, married, sep = ", "))  %>%
  ggplot(aes(
    x = prop_above_40,
    y = reorder(label, prop_above_40)
    )) +
  geom_errorbar(aes(xmin = prop_above_40 - prop_above_40_moe,
                     xmax = prop_above_40 + prop_above_40_moe)) +
  geom_point(size = 3, color = "navy") +
  labs(title = "Rent burdened-households in LA metro",
       x = "2016-2020 ACS estimate (from PUMS data)",
       y = "",
       caption = "Rent-burdened defined when gross rent is 40 percent or more\nof household income. Error bars represent a 90 percent confidence level.") +
  scale_x_continuous(labels = scales::percent) +
  theme_grey(base_size = 12)

9 Health

9.1 Insurance coverage and disparity

Code

lac_inscov <- get_acs(
  geography = "county",
  variables = c(
    total = "B27011_001",
    in_labor_force_employed_insured = "B27011_004",
    in_labor_force_employed_uninsured = "B27011_007",
    in_labor_force_unemployed_insured = "B27011_009",
    in_labor_force_unemployed_uninsured = "B27011_012",
    in_labor_force_employed_insured = "B27011_004",
    notin_labor_force_uninsured = "B27011_014",
    notin_labor_force_insured = "B27011_017"
    ),
  county = "Los Angeles",
  state = "CA",
  year = 2021,
  survey = "acs1",
  # output = "wide"
)

12.5% of Angelenos don’t have insurance (2021 ACS).

Code

lac_insured <- get_acs(
  geography = "tract",
  variables = c(pct_insured = "DP03_0096P"),
  county = c("Los Angeles"),
  state = "CA",
  year = 2021,
  geometry = TRUE,
  output = "wide"
)

Code

mapview(lac_insured, zcol = "pct_insuredE", layer = "% insured") + ucla_map

10 Roadmap

July 20:

8:10 am - 9 am: Preface: Communities in Los Angeles County.
9 am - 9:10 am: Break.
9:10 am - 10 am US Census data and R programming language.
10 am - 10:30 am: Break.
10:30 am - 11:30 am: tidycensus.
11:30 am - 11:40 am: Break.
11:40 am - 12:30 pm: Data wrangling using tidyverse.
12:30 pm - 1 pm: Break.
1 pm - 2 pm: Visualizing census data.
2 pm - 3 pm: Office hour (on Zoom).