Code
<- get_acs(
lac_acs_2020 geography = "tract",
county = c("Los Angeles"),
state = "CA",
variable = c(total_pop = "B01003_001"),
year = 2020,
geometry = TRUE,
output = "wide"
%>%
) st_transform(4326)
R25 Modelers and Story Tellers
UCLA is located in the Los Angeles County (LA County). Let us use various data science tools in R (tidyverse, tidycensus) to explore the demographics of the LA County. It gives a preview of the Import
-> Tidy
-> Transform
-> Visualize
workflow that we’ll practice today and tomorrow. Model
and Communicate
will be the focus on Days 3 and 4.
This dynamic document was produced by Quarto, which supports R, Python, and Julia. Detailed R syntax will be covered in the next tutorials.
<- get_acs(
lac_acs_2020 geography = "tract",
county = c("Los Angeles"),
state = "CA",
variable = c(total_pop = "B01003_001"),
year = 2020,
geometry = TRUE,
output = "wide"
%>%
) st_transform(4326)
There are 2498 census tracts in LA County. On average each census tract contains 4000 residents. These define our communities and we want to visualize health, income, education, housing, and health disparities in these communities.
<- tibble(
ucla_map location = c("UCLA"),
longitude = c(-118.444990),
latitude = c(34.070963)
%>%
) st_as_sf(
coords = c("longitude", "latitude"),
crs = 4326
%>%
) st_transform(4326) %>%
mapview(
col.regions = "red",
legend = FALSE
)
mapview(lac_acs_2020, legend = FALSE) + ucla_map
# 2022 Decennial Census data
<- get_decennial(
lac_dec_2020 geography = "county",
state = "CA",
county = c("Los Angeles"),
variables = c(
total_pop = "P2_001N"
),year = 2020
)
# National population size from 2020 Decennial Census.
<- get_decennial(
uspop20 geography = "state",
variables = "P2_001N",
year = 2020
)
LA metro area has 10014009 residents, about 3% of the US population (2020 Decennial Census).
# 2021 1-year ACS data for LA County
<- get_acs(
lac_acs1_2021 geography = "county",
county = c("Los Angeles"),
state = "CA",
year = 2021,
variables = c(med_age = "B01002_001"),
survey = "acs1"
)
The median age is 37.8 in LA County (2021 ACS).
Population by Age and Sex (Pyramid Plot):
# ingest
<- get_estimates(
lac_pyramid geography = "county",
county = c("Los Angeles"),
state = "CA",
product = "characteristics",
breakdown = c("SEX", "AGEGROUP"),
breakdown_labels = TRUE,
year = 2019
%>%
) # wrangle
filter(
str_detect(AGEGROUP, "^Age"),
!= "Both sexes"
SEX %>%
) mutate(value = ifelse(SEX == "Male", -value, value)) %>%
# visualize
ggplot(aes(x = value, y = AGEGROUP, fill = SEX)) +
geom_col(width = 0.95, alpha = 0.75) +
theme_minimal(base_family = "Verdana", base_size = 12) +
scale_x_continuous(
labels = ~ number_format(scale = .001, suffix = "k")(abs(.x)),
limits = 1000000 * c(-0.5, 0.5)
+
) scale_y_discrete(labels = ~ str_remove_all(.x, "Age\\s|\\syears")) +
scale_fill_manual(values = c("darkred", "navy")) +
labs(x = "",
y = "2019 ACS estimate",
title = "Population structure in Los Angeles County",
fill = "",
caption = "Data source: US Census Bureau population estimates")
ggplotly(lac_pyramid)
# ingest
<- get_estimates(
senior_prop geography = "county",
county = c("Los Angeles"),
state = "CA",
product = "characteristics",
breakdown = c("SEX", "AGEGROUP"),
breakdown_labels = TRUE,
year = 2019
%>%
) # wrangle
filter(
str_detect(AGEGROUP, "^Age"),
!= "Both sexes"
SEX %>%
) mutate(AGEGROUP = as.ordered(AGEGROUP)) %>%
mutate(senior = AGEGROUP >= "Age 65 to 69 years") %>%
summarise(seniorprop = sum(senior * value) / sum(value))
14.1% of Angelenos are 65 years and older(2014-2019 ACS).
# Count of races in each tract from the 2020 decennial census
<- get_decennial(
lac_race geography = "tract",
state = "CA",
county = c("Los Angeles"),
variables = c(
Hispanic = "P2_002N",
White = "P2_005N",
Black = "P2_006N",
Native = "P2_007N",
Asian = "P2_008N"
),summary_var = "P2_001N",
year = 2020,
geometry = TRUE
%>%
) mutate(percent = 100 * (value / summary_value))
Percentages of Asian, Black, Hispanic, Native, and White populations in each census tract in LA County.
%>%
lac_race drop_na() %>%
tm_shape() +
tm_facets(by = "variable", scale.factor = 4) +
tm_fill(col = "percent",
style = "quantile",
n = 6,
palette = "Blues",
title = "Percent (2020 US Census)",) +
tm_layout(bg.color = "grey",
legend.position = c(-0.7, 0.15),
panel.label.bg.color = "white")
<- lac_race %>%
lac_dots drop_na() %>%
as_dot_density(
value = "value",
values_per_dot = 150,
group = "variable"
)
<- lac_race %>%
background_tracts drop_na() %>%
filter(variable == "White")
tm_shape(background_tracts) +
tm_polygons(col = "white",
border.col = "grey") +
tm_shape(lac_dots) +
tm_dots(col = "variable",
palette = "Set1",
size = 0.005,
title = "1 dot = 150 people") +
tm_layout(legend.outside = TRUE,
title = "Race/Ethnicity\n2020 Census")
The following table tallies the segregation indices \(H\) in major urban areas in California with population > 750,000. Higher \(H\) indicates more segregation.
library(segregation)
# Get California tract data by race/ethnicity
<- get_acs(
ca_acs_data geography = "tract",
variables = c(
white = "B03002_003",
black = "B03002_004",
asian = "B03002_006",
hispanic = "B03002_012"
), state = "CA",
geometry = TRUE,
year = 2019
)
# Use tidycensus to get urbanized areas by population with geometry,
# then filter for those that have populations of 750,000 or more
<- get_acs(
us_urban_areas geography = "urban area",
variables = "B01001_001",
geometry = TRUE,
year = 2019,
survey = "acs1"
%>%
) filter(estimate >= 750000) %>%
transmute(urban_name = str_remove(NAME,
fixed(", CA Urbanized Area (2010)")))
# Compute an inner spatial join between the California tracts and the
# urbanized areas, returning tracts in the largest California urban
# areas with the urban_name column appended
<- ca_acs_data %>%
ca_urban_data st_join(us_urban_areas, left = FALSE) %>%
select(-NAME) %>%
st_drop_geometry()
mutual_within(
data = ca_urban_data,
group = "variable",
unit = "GEOID",
weight = "estimate",
within = "urban_name",
wide = TRUE
%>%
) select(urban_name, H) %>%
arrange(desc(H))
urban_name H
1: Los Angeles--Long Beach--Anaheim 0.2851662
2: San Francisco--Oakland 0.2116127
3: San Diego 0.2025728
4: San Jose 0.1829190
5: Sacramento 0.1426804
6: Riverside--San Bernardino 0.1408461
Patterns of segregation across the most segregated urban area, Los Angeles:
<- ca_urban_data %>%
la_local_seg filter(urban_name == "Los Angeles--Long Beach--Anaheim") %>%
mutual_local(
group = "variable",
unit = "GEOID",
weight = "estimate",
wide = TRUE
)
<- tracts("CA", cb = TRUE, year = 2019) %>%
la_tracts_seg inner_join(la_local_seg, by = "GEOID")
%>%
la_tracts_seg ggplot(aes(fill = ls)) +
geom_sf(color = NA) +
coord_sf(crs = 26946) +
scale_fill_viridis_c(option = "inferno") +
theme_void() +
labs(fill = "Local\nsegregation index")
<- get_acs(
lac_lang geography = "county",
county = c("Los Angeles"),
state = "CA",
year = 2021,
variables = c(
English = "DP02_0113",
Spanish = "DP02_0116",
Indo_European = "DP02_0118",
Asian = "DP02_0120",
Other = "DP02_0122"
),summary_var = "DP02_0112",
survey = "acs5"
%>%
) mutate(percent = estimate / summary_est)
55.8% of Angelenos speak a language other than English at home (2016-2021 ACS).
<- lac_lang %>%
lac_lang_plot ggplot(aes(x = fct_rev(fct_reorder(variable, percent)), y = percent)) +
geom_col(color = "navy", fill = "navy",
alpha = 0.5, width = 0.4) +
scale_y_continuous(labels = label_percent(scale = 100)) +
labs(
title = "Languages spoken at home in LA County",
subtitle = "2016-2021 ACS, population 5 years and over",
x = "Language",
y = "Percent"
)
ggplotly(lac_lang_plot)
<- get_acs(
lac_foreignborn geography = "county",
county = c("Los Angeles"),
state = "CA",
year = 2021,
variables = c(
Total = "DP02_0105",
Europe = "DP02_0106",
Asia = "DP02_0107",
Africa = "DP02_0108",
Oceania = "DP02_0109",
Latin_America = "DP02_0110",
North_American = "DP02_0111"
),summary_var = "B01003_001",
survey = "acs5"
%>%
) mutate(percent = estimate / summary_est)
33.5% of Angelenos are foreign-born (2016-2021 ACS).
<- lac_foreignborn %>%
lac_foreignborn_plot filter(variable != "Total") %>%
ggplot(aes(x = fct_rev(fct_reorder(variable, percent)), y = percent)) +
geom_col(color = "navy", fill = "navy",
alpha = 0.5, width = 0.5) +
scale_y_continuous(labels = label_percent(scale = 100)) +
labs(
title = "Foreign born population in LA County",
subtitle = "2016-2021 ACS",
x = "Continent",
y = "Percent"
)
ggplotly(lac_foreignborn_plot)
# 2021 1-year ACS data for LA County
<- c(
acs1_variables total_population = "B01003_001",
med_age = "B01002_001",
med_house_val = "B25077_001",
med_rooms = "B25018_001",
med_year_built = "B25037_001",
med_income = "DP03_0062",
pct_college = "DP02_0068P",
pct_foreign_born = "DP02_0094P",
pct_insured = "DP03_0096P",
pct_ooh = "DP04_0046P",
pct_white = "DP05_0077P",
poverty_denom = "B17010_001",
poverty_num = "B17010_002",
pop_16above = "DP03_0001",
emp_16above = "DP03_0004"
)
<- get_acs(
lac_acs1_2021 geography = "county",
county = c("Los Angeles"),
state = "CA",
year = 2021,
variables = c(
pop_16above = "DP03_0001",
emp_16above = "DP03_0004"
), survey = "acs1"
)
The employment rate in population 16 years and over in LA County is 58.3% (2021 ACS).
<- get_acs(
lac_acs1_2021 geography = "county",
county = c("Los Angeles"),
state = "CA",
year = 2021,
variables = c(
med_income = "DP03_0062"
), survey = "acs1"
)
The median household income in LA County is $77456 (2021 ACS).
<- get_acs(
lac_poverty geography = "county",
state = "CA",
county = "Los Angeles",
year = 2021,
survey = "acs1",
variables = c(
total_poverty_0_99 = "B06012_002",
total_poverty_100_149 = "B06012_003",
total_poverty_150_above = "B06012_004"
) )
14.2% percentage of Angelenos live under the poverty line (2021 ACS)!
Poverty struck neighborhoods:
library(tidycensus)
library(ggiraph)
library(tidyverse)
library(patchwork)
library(scales)
<- get_acs(
lac_medincome geography = "tract",
variables = c(med_income = "DP03_0062"),
county = c("Los Angeles"),
state = "CA",
year = 2021,
survey = "acs5",
geometry = TRUE,
output = "wide"
)
<- lac_medincome %>%
medinc_top10 slice_min(med_incomeE, n = 10) %>%
mutate(NAME = str_remove(NAME, ", Los Angeles County, California")) %>%
mutate(NAME = str_remove(NAME, "Census"))
<- ggplot() +
lametro_map geom_sf(data = county_subdivisions(state = "CA", county = "Los Angeles", cb = TRUE) %>% filter(!GEOID %in% c("0603792360", "0603792140", "0603792110", "0603793090", "0603790015", "0603790810", "0603790810", "0603793730", "0603791730", "0603790730", "0603792400", "0603793510", "0603793200"))) +
geom_sf_interactive(
data = medinc_top10,
mapping = aes(fill = med_incomeE, data_id = GEOID)
+
) scale_fill_distiller(
palette = "Greens",
direction = 1,
guide = "none"
+
) theme_void()
<- ggplot(
lametro_plot data = medinc_top10,
mapping = aes(x = med_incomeE, y = reorder(NAME, med_incomeE), fill = med_incomeE)
+
) geom_errorbar(aes(xmin = med_incomeE - med_incomeM, xmax = med_incomeE + med_incomeM)) +
geom_point_interactive(
color = "black",
size = 4,
shape = 21,
aes(data_id = GEOID)
+
) scale_fill_distiller(
palette = "Greens",
direction = 1,
labels = label_dollar()
+
) scale_x_continuous(labels = label_dollar()) +
labs(title = "Household income by census tract in LA County",
subtitle = "2016-2021 American Community Survey",
y = "",
x = "ACS estimate (bars represent margin of error)",
fill = "ACS estimate") +
theme_minimal(base_size = 14)
girafe(ggobj = lametro_map + lametro_plot, width_svg = 10, height_svg = 5) %>%
girafe_options(opts_hover(css = "fill:red;"))
<- get_acs(
lac_edu geography = "county",
state = "CA",
county = "Los Angeles",
year = 2020,
variables = c(
less_hs = "B06009_002",
hs = "B06009_003",
as = "B06009_004",
col = "B06009_005",
grad = "B06009_006"
),summary_var = "B06009_001"
)
33.5% of the population aged 25 and up in LA County have a bachelor’s degree or higher (2016-2020 ACS).
<- get_acs(
lac_pctcollege geography = "tract",
variables = c(pct_college = "DP02_0068P"),
county = c("Los Angeles"),
state = "CA",
year = 2020,
survey = "acs5",
geometry = TRUE,
output = "wide"
)
mapview(lac_pctcollege, zcol = "pct_collegeE", layer = "% college") + ucla_map
<- get_acs(
lac_commute geography = "county",
county = "Los Angeles",
state = "CA",
variables = c(
Drove_alone = "DP03_0019P",
Carpool = "DP03_0020P",
Public_transportation = "DP03_0021P",
Walk = "DP03_0022P",
Other_means = "DP03_0023P",
Work_from_home = "DP03_0024P",
Mean_travel_time_to_work_in_minutes = "DP03_0025"
),year = 2021
)
<- lac_commute %>%
lac_commute_plot filter(!str_detect(variable, "Mean_travel_time")) %>%
ggplot() +
geom_col(
aes(x = fct_rev(fct_reorder(variable, estimate)), y = estimate / 100),
color = "navy",
fill = "navy",
alpha = 0.5,
width = 0.6
+
) scale_y_continuous(labels = scales::percent) +
labs(
title = "Commuting mode in LA County",
subtitle = "2016-2021 ACS",
x = NULL,
y = "Percent among workers 16 years and over"
)
ggplotly(lac_commute_plot)
An average Angeleno worker (16 years or over) spends 31.4 minutes commuting to work.
get_acs(
geography = "tract",
variables = c(Commute_Time_in_Minutes = "DP03_0025"),
county = c("Los Angeles"),
state = "CA",
year = 2021,
survey = "acs5",
geometry = TRUE,
output = "wide"
%>%
) # print() %>%
mapview(zcol = "Commute_Time_in_MinutesE", layer.name = "Minutes") + ucla_map
# PUMA in LA metro
<- pumas(state = "CA", cb = TRUE, year = 2019) %>%
lametro_pumas filter(str_detect(NAME10, "(Los Angeles County|Orange County)"))
# PUMS variable
<- c("PUMA", "GRPIP", "RAC1P",
hh_variables "HISP", "HHT")
# ingest PUMS data
# get_pums(
# variables = hh_variables,
# state = "CA",
# puma = lametro_pumas$PUMACE10,
# year = 2020,
# variables_filter = list(
# SPORDER = 1,
# TEN = 3
# ),
# recode = TRUE
# ) %>%
read_rds("lametro_hh_data.rds") %>%
# recode
mutate(
race_ethnicity = case_when(
!= "01" ~ "Hispanic",
HISP == "01" & RAC1P == "1" ~ "White",
HISP == "01" & RAC1P == "2" ~ "Black",
HISP TRUE ~ "Other"
),married = case_when(
== "1" ~ "Married",
HHT TRUE ~ "Not married"
)%>%
) # group-wise summary
filter(race_ethnicity != "Other") %>%
group_by(race_ethnicity, married, PUMA) %>%
summarize(
prop_above_40 = sum(WGTP[GRPIP >= 40]) / sum(WGTP)
%>%
) # join with PUMA
left_join(lametro_pumas, by = c("PUMA" = "PUMACE10")) %>%
# save as rds
write_rds(file = "lametro_data_for_map.rds") %>%
print()
library(tmap)
read_rds("lametro_data_for_map.rds") %>%
st_as_sf() %>%
tm_shape() +
tm_facets(by = c("race_ethnicity", "married"), scale.factor = 6) +
tm_fill(col = "prop_above_40",
style = "quantile",
n = 5,
palette = "Blues",
title = "Percent households") +
tm_layout(bg.color = "grey",
legend.outside = TRUE,
panel.label.bg.color = "white",
main.title = "Rent burdened-households in LA County\n2016-2020 ACS estimate (from PUMS data)")
We define a household to be rent-burdened when gross rent is 40 percent or more of household income.
library(survey)
library(srvyr)
read_rds("lametro_hh_replicate.rds") %>%
to_survey(type = "housing",
design = "rep_weights") %>%
filter(TEN == 3) %>%
mutate(
race_ethnicity = case_when(
!= "01" ~ "Hispanic",
HISP == "01" & RAC1P == "1" ~ "White",
HISP == "01" & RAC1P == "2" ~ "Black",
HISP TRUE ~ "Other"
),married = case_when(
== "1" ~ "Married",
HHT TRUE ~ "Not married"
),above_40 = GRPIP >= 40
%>%
) filter(race_ethnicity != "Other") %>%
group_by(race_ethnicity, married) %>%
summarize(
prop_above_40 = survey_mean(above_40)
%>%
) # MOE
mutate(prop_above_40_moe = prop_above_40_se * 1.645,
label = paste(race_ethnicity, married, sep = ", ")) %>%
ggplot(aes(
x = prop_above_40,
y = reorder(label, prop_above_40)
+
)) geom_errorbar(aes(xmin = prop_above_40 - prop_above_40_moe,
xmax = prop_above_40 + prop_above_40_moe)) +
geom_point(size = 3, color = "navy") +
labs(title = "Rent burdened-households in LA metro",
x = "2016-2020 ACS estimate (from PUMS data)",
y = "",
caption = "Rent-burdened defined when gross rent is 40 percent or more\nof household income. Error bars represent a 90 percent confidence level.") +
scale_x_continuous(labels = scales::percent) +
theme_grey(base_size = 12)
<- get_acs(
lac_inscov geography = "county",
variables = c(
total = "B27011_001",
in_labor_force_employed_insured = "B27011_004",
in_labor_force_employed_uninsured = "B27011_007",
in_labor_force_unemployed_insured = "B27011_009",
in_labor_force_unemployed_uninsured = "B27011_012",
in_labor_force_employed_insured = "B27011_004",
notin_labor_force_uninsured = "B27011_014",
notin_labor_force_insured = "B27011_017"
),county = "Los Angeles",
state = "CA",
year = 2021,
survey = "acs1",
# output = "wide"
)
12.5% of Angelenos don’t have insurance (2021 ACS).
<- get_acs(
lac_insured geography = "tract",
variables = c(pct_insured = "DP03_0096P"),
county = c("Los Angeles"),
state = "CA",
year = 2021,
geometry = TRUE,
output = "wide"
)
mapview(lac_insured, zcol = "pct_insuredE", layer = "% insured") + ucla_map
July 20:
8:10 am - 9 am: Preface: Communities in Los Angeles County.
9 am - 9:10 am: Break.
9:10 am - 10 am US Census data and R programming language.
10 am - 10:30 am: Break.
10:30 am - 11:30 am: tidycensus.
11:30 am - 11:40 am: Break.
11:40 am - 12:30 pm: Data wrangling using tidyverse.
12:30 pm - 1 pm: Break.
1 pm - 2 pm: Visualizing census data.
2 pm - 3 pm: Office hour (on Zoom).