Data Analysis

Set up

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(maps)

Warning: package 'maps' was built under R version 4.4.2


Attaching package: 'maps'

The following object is masked from 'package:purrr':

    map

library(usmap)

Warning: package 'usmap' was built under R version 4.4.2

library(plotly)


Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

Load data

df_time<- readRDS("data/df_time.rds")
df_cleaned<- readRDS("data/df_cleaned.rds")
df_gunlaw <-readRDS("data/df_gunlaw.rds")
df_pop <-readRDS("data/df_pop.rds")

Q1. Time pattern of mass shootings

by month

df_time |>
  group_by(month)|>
  summarize(month_count = n())|>
  ggplot(aes(x = month, y = month_count, fill = month_count))+
  geom_col()+
  geom_text(aes(label = month_count), vjust = -0.4) +
  labs(title = "Which month sees the highest number of mass shooting incidents?",
       x = NULL,
       y = "Number of Incidents" ,
       legend = NULL)+
  theme_light()+
  theme(plot.title = element_text(hjust=.5, vjust = 3))+
  theme(legend.position = "none")

ggsave("out/shooting_month.png", width = 8, height = 6, dpi = 300)

by day of the week

weekday_names <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", 
                 "Sunday")
df_time <-df_time  |>
  mutate(en_weekday = factor(en_weekday, levels = weekday_names))

df_time |>
  group_by(en_weekday)|>
  summarize(day_count=n())|>
  ggplot(aes(x = en_weekday, y = day_count, fill = day_count))+
  geom_col()+
  geom_text(aes(label = day_count), vjust = -0.5) +
  labs(title = "On which day of the week do mass shootings occur most frequently?",
       x = NULL,
       y = "Number of Incidents" )+
  theme_light()+
  theme(plot.title = element_text(hjust=0.5))+
  theme(legend.position = "none")

ggsave("out/shooting_weekday.png", width = 6, height = 6, dpi = 300)

Which day sees the highest number of occurence?

df_time |>
  group_by(date)|>
  summarize(date_sum = n())|>
  arrange(-date_sum)

# A tibble: 188 × 2
   date            date_sum
   <chr>              <int>
 1 March 31, 2024         9
 2 July 28, 2024          8
 3 July 4, 2024           8
 4 July 5, 2024           8
 5 June 23, 2024          8
 6 May 18, 2024           7
 7 August 25, 2024        6
 8 August 4, 2024         6
 9 June 15, 2024          6
10 June 2, 2024           6
# ℹ 178 more rows

Which day sees the highest number of casualties

df_time |>
  group_by(date)|>
  summarize(cas_sum = sum(casualty))|>
  arrange(-cas_sum)

# A tibble: 188 × 2
   date               cas_sum
   <chr>                <dbl>
 1 June 2, 2024            59
 2 July 4, 2024            56
 3 June 23, 2024           47
 4 June 15, 2024           45
 5 September 21, 2024      43
 6 July 5, 2024            42
 7 March 31, 2024          42
 8 July 28, 2024           39
 9 May 18, 2024            39
10 October 12, 2024        38
# ℹ 178 more rows

Q2. Regional patterns of mass shootings

What’s the total number of mass shootings by states?

df_time |>
  select(id,date,state, location, casualty)|>
  group_by(state)|>
  summarise(occurence = n())|>
  arrange(desc(occurence))|>
  ggplot(aes(x = fct_reorder(state,occurence), y = occurence, fill = occurence))+
  geom_col()+
  coord_flip()+
  labs(title = "Mass Shootings by States",
       subtitle = "In 2024 (till Oct.20 )",
       x = NULL,
       y= "Number of Incidents")+
  
  theme_bw()+
  theme(legend.position = "none")+
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))+
   theme(axis.text.y = element_text(size = 6.5))

Q3. (1)What’s the total casualties (defined as individuals injured or killed) by city or county?

df_cleaned |>
  select(id,date,state, location, casualty)|>
  group_by(location)|>
  summarise(casualties = sum(casualty))|>
  top_n(10,casualties) |>
  ggplot(aes(x = fct_reorder(location,-casualties), y = casualties, fill = casualties))+
  geom_col()+
  scale_fill_viridis_c(option = "mako", direction = -1) +
  labs(title = "Which 10 cities have the most casualties from mass shootings?",
       subtitle = "In 2024 (till Oct.20 )",
       x = NULL,
       y= "Casualties")+
  geom_text(aes(label = casualties), vjust = -0.5) +
  theme_classic()+
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))+
  theme(legend.position = "none")

ggsave("out/top10_location.png",width = 10, height = 8, dpi = 300)

Fatalities and injuries in Chicago

df_cleaned |>
  filter(location == "Chicago")|>
  summarize(chi_death_num = sum(fatality),
            chi_injury_num = sum(injuries))

# A tibble: 1 × 2
  chi_death_num chi_injury_num
          <dbl>          <dbl>
1            15            117

Incidents’ number in Chicago

df_cleaned |>
  filter(location == "Chicago")|>
  summarize(chi_incidents_num = n())

# A tibble: 1 × 1
  chi_incidents_num
              <int>
1                25

The most severe cases

df_cleaned |>
  filter(casualty == max(casualty) |fatality == max(fatality)|injuries == max(injuries))

# A tibble: 2 × 15
       id date        state location address fatality injuries s_deceased s_hurt
    <dbl> <chr>       <chr> <chr>    <chr>      <dbl>    <dbl>      <dbl>  <dbl>
1 2928467 June 2, 20… Ohio  Akron    Kelly …        1       28          0      0
2 2808531 January 21… Illi… Joliet   200 bl…        8        1          1      0
# ℹ 6 more variables: s_detained <dbl>, operations <lgl>, lat <dbl>,
#   long <dbl>, coor <chr>, casualty <dbl>

Q3. (2)What’s the total casualties by states?

Casualties by detailed locations–scatters on map

df_cleaned <- df_cleaned |>
  mutate(size_num = casualty/4) # to make the point a map standardized, use 4 as division because the definition of mass shootings requires 4 or more victims

usa_map <- map_data("state")

ggplot() +
  geom_polygon(data = usa_map, aes(x = long, y = lat, group = group), fill = "lightgrey") +  
  geom_point(data = df_cleaned, aes(x = long, y = lat, size = size_num), color = "red") +  
  labs(title = "Mass Shootings in America", 
       subtitle = "sclaed by casualties",
       x = "Longitude",
       y = "Latitude") +
  theme_bw()+
  theme(legend.position = "none")+
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))

ggsave("out/shooting_map.png",  width = 6, height = 4, dpi = 300)

Casualties by states & Average victim numbers by incidents per state

Analysis

state_casualties <- df_cleaned |>
  group_by(state) |> 
  summarize(total_casualties = sum(casualty),
            total_occurence = n()) |>
  mutate(victimization_rate = total_casualties/total_occurence) #victimization rate here means average victim numbers per incident

print(state_casualties)

# A tibble: 43 × 4
   state                total_casualties total_occurence victimization_rate
   <chr>                           <dbl>           <int>              <dbl>
 1 Alabama                           139              21               6.62
 2 Alaska                              4               1               4   
 3 Arizona                            14               3               4.67
 4 Arkansas                           43               6               7.17
 5 California                        154              28               5.5 
 6 Colorado                           14               3               4.67
 7 Connecticut                        11               3               3.67
 8 Delaware                            4               1               4   
 9 District of Columbia               58              12               4.83
10 Florida                           126              25               5.04
# ℹ 33 more rows

Visualization –casualties by state

#png("out/casualty_map.png", width = 800, height = 600) 
plot_usmap(data = state_casualties,values = "total_casualties", color = "red")+
  scale_fill_continuous(
    low = "antiquewhite", high = "red", name = "Casualty", label = scales::comma, na.value = "white") + 
  labs(title = "Mass Shooting Casualties",
       subtitle = "U.S. 2024 (till Oct.20)")+
  theme(legend.position = "right")+
   theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))

#dev.off() #comment the "save" procedure out, or it will not be published in to the website

Visualization – Average victim numbers by incidents per state

#png("out/victimization_map.png", width = 800, height = 600) #save the photo 
plot_usmap(data = state_casualties,values = "victimization_rate", color = "red")+
  scale_fill_continuous(
    low = "white", 
    high = "red", 
    name = "Avg. Victims", 
    label = scales::comma,
    na.value = "white")+
  labs(title = "Average Number of Victims per Mass Shooting Incident",
       subtitle = "U.S. 2024 (till Oct.20)")+
  theme(legend.position = "right")+
   theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))

#dev.off  #comment the "save" procedure out, or it will not be published in to the website

Q4. The relationship between gun law strength and mass shooting

Create a data frame for mapping state names to abbreviations

state_abbr <- data.frame(
  state_full = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", 
                 "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
                 "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", 
                 "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland",
                 "Massachusetts", "Michigan", "Minnesota", "Mississippi", 
                 "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", 
                 "New Jersey", "New Mexico", "New York", "North Carolina", 
                 "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", 
                 "Rhode Island", "South Carolina", "South Dakota", "Tennessee", 
                 "Texas", "Utah", "Vermont", "Virginia", "Washington", 
                 "West Virginia", "Wisconsin", "Wyoming"),
  state_abbr = c("AL", "AK", "AZ", "AR", "CA", 
                 "CO", "CT", "DE", "FL", "GA",
                 "HI", "ID", "IL", "IN", "IA", 
                 "KS", "KY", "LA", "ME", "MD",
                 "MA", "MI", "MN", "MS", "MO", 
                 "MT", "NE", "NV", "NH", "NJ", 
                 "NM", "NY", "NC", "ND", "OH", 
                 "OK", "OR", "PA", "RI", "SC", 
                 "SD", "TN", "TX", "UT", "VT", 
                 "VA", "WA", "WV", "WI", "WY")
)

Merge dataframes

law_shootings <- state_casualties |>
  left_join(df_pop, by = "state") |>
  left_join(df_gunlaw, by = "state")|>
  left_join(state_abbr, by = c("state" = "state_full"))

glimpse(law_shootings)

Rows: 43
Columns: 7
$ state              <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Califo…
$ total_casualties   <dbl> 139, 4, 14, 43, 154, 14, 11, 4, 58, 126, 102, 5, 16…
$ total_occurence    <int> 21, 1, 3, 6, 28, 3, 3, 1, 12, 25, 19, 1, 32, 10, 1,…
$ victimization_rate <dbl> 6.619048, 4.000000, 4.666667, 7.166667, 5.500000, 4…
$ population         <dbl> 5143033, 733536, 7497004, 3089060, 38889770, 591418…
$ score              <dbl> 12.5, 9.0, 8.5, 3.0, 89.5, 63.0, 82.5, 61.5, NA, 27…
$ state_abbr         <chr> "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", NA,…

Add party affliation

red_states <- c("Alabama", "Alaska", "Arkansas","Arizona",  "Indiana","Iowa",  "Florida","Georgia", "Kansas", "Kentucky", "Louisiana", "Michigan", 
                "Mississippi", "Missouri", "Nebraska","Nevada", "North Carolina" ,"Oklahoma", "Ohio", "Pennsylvania",
                "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", 
                "West Virginia", "Wisconsin")

blue_states <- c("California", "Colorado", "Connecticut", "Delaware", 
                 "Hawaii", "Illinois", "Maine", "Maryland", "Massachusetts",
                 "Minnesota","New Jersey", "New Mexico", "New York", 
                  "Rhode Island", "Virginia", 
                 "Washington")
state_party <- c(rep("Republican", length(red_states)), rep("Democratic", length(blue_states)))
names(state_party) <- c(red_states, blue_states)

Analysis & Visualization

law_shootings |>
  mutate(cas_rate = total_casualties/population * 1000000)|> #use casualties per million people to provides a standardized way to compare gun violence rates
  select(state, state_abbr, cas_rate, score)|>
  filter(state != "District of Columbia")|> # exclude the data for Distirct of Columbia, because its population and gun law strength has not been recorded
  mutate(party = state_party[state]) |>
  ggplot(aes(x = cas_rate, y = score, color = party)) +
  geom_point(size = 2.5, alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "darkgrey") +
  geom_text(aes(label = state_abbr), vjust = -0.9, size = 2) +
  labs(title = "The Relationship Between Mass Shooting Casualties and Gun Law Strength",
       x = "casualties per million people",
       y = "gun law strength score") +
  theme_bw()+
  scale_color_manual(values = c("blue", "red"))+
  theme(plot.title = element_text(hjust = 0.5))

`geom_smooth()` using formula = 'y ~ x'

ggsave("out/scatter_plot.png", width = 12, height = 8, dpi = 300)

`geom_smooth()` using formula = 'y ~ x'