Data Cleaning

Set up

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Mass shooting data

Import data

df<-read_csv("data/massshootings.csv")
Rows: 427 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): Incident Date, State, City Or County, Address, Coordinates_Found
dbl (8): Incident ID, Victims Killed, Victims Injured, Suspects Killed, Susp...
lgl (1): Operations

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Inspect data

#head(df) #comment the line out or it will affect publishing
glimpse(df)
Rows: 427
Columns: 14
$ `Incident ID`       <dbl> 3052758, 3052028, 3051984, 3051041, 3050940, 30517…
$ `Incident Date`     <chr> "October 21, 2024", "October 20, 2024", "October 2…
$ State               <chr> "Washington", "Tennessee", "Louisiana", "Pennsylva…
$ `City Or County`    <chr> "Fall City", "Jackson", "Baton Rouge", "Philadelph…
$ Address             <chr> "7700 block of Lake Alice Rd SE", "2310 N Highland…
$ `Victims Killed`    <dbl> 5, 1, 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 2,…
$ `Victims Injured`   <dbl> 1, 8, 5, 7, 8, 4, 10, 3, 9, 14, 7, 4, 4, 5, 5, 4, …
$ `Suspects Killed`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `Suspects Injured`  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ `Suspects Arrested` <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 1, 0, 0, 0, 1, 3, 1,…
$ Operations          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Latitude            <dbl> 47.56812, 35.61390, 30.44335, 39.95222, 33.11464, …
$ Longitude           <dbl> -121.89086, -88.81940, -91.18664, -75.16218, -90.0…
$ Coordinates_Found   <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "…
unique(df$Coordinates_Found) # to see if there were any missing values
[1] "Yes"

Data cleaning

Rename

df_cleaned <- df |>
  rename(id = `Incident ID`,
         date = `Incident Date`,
         state = State,
         location = `City Or County`,
         address = Address,
         fatality = `Victims Killed`,
         injuries = `Victims Injured`,
         s_deceased = `Suspects Killed`,
         s_hurt = `Suspects Injured`,
         s_detained = `Suspects Arrested`,
         operations = Operations,
         lat = Latitude,
         long = Longitude,
         coor = Coordinates_Found)|>
  mutate(casualty = fatality + injuries) # casualty refers to a person who is injured or killed 
glimpse(df_cleaned)
Rows: 427
Columns: 15
$ id         <dbl> 3052758, 3052028, 3051984, 3051041, 3050940, 3051717, 30516…
$ date       <chr> "October 21, 2024", "October 20, 2024", "October 20, 2024",…
$ state      <chr> "Washington", "Tennessee", "Louisiana", "Pennsylvania", "Mi…
$ location   <chr> "Fall City", "Jackson", "Baton Rouge", "Philadelphia", "Lex…
$ address    <chr> "7700 block of Lake Alice Rd SE", "2310 N Highland Ave", "9…
$ fatality   <dbl> 5, 1, 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 2, 1, 1, 1,…
$ injuries   <dbl> 1, 8, 5, 7, 8, 4, 10, 3, 9, 14, 7, 4, 4, 5, 5, 4, 3, 3, 3, …
$ s_deceased <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ s_hurt     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ s_detained <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 1, 0, 0, 0, 1, 3, 1, 0, 1, 0,…
$ operations <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ lat        <dbl> 47.56812, 35.61390, 30.44335, 39.95222, 33.11464, 31.58140,…
$ long       <dbl> -121.89086, -88.81940, -91.18664, -75.16218, -90.05281, -84…
$ coor       <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Ye…
$ casualty   <dbl> 6, 9, 5, 7, 11, 5, 11, 4, 10, 15, 8, 5, 4, 5, 6, 4, 5, 4, 4…

Convert character to date

Create month column

df_time <- df_cleaned |>
  mutate(date_new = mdy(date),  #mdy means month_day_year
         month_num = month(date_new)) #month function returns the results as number
month_names <- c("January", "February", "March", "April", "May", "June", 
                 "July", "August", "September", "October", "November", "December") #create a mapping system for the number and month to correspond
df_time <- df_time |>
  mutate(month = factor(month_names[month_num], levels = month_names)) #turn the value of new column into factors

Create day of the week column

translate_weekday <- function(cn_day) { 
  weekdays_map <- c("星期一" = "Monday",
                    "星期二" = "Tuesday",
                    "星期三" = "Wednesday",
                    "星期四" = "Thursday",
                    "星期五" = "Friday",
                    "星期六" = "Saturday",
                    "星期日" = "Sunday")
  
  return(weekdays_map[cn_day]) #create a function to return English name of the day based on Chinese name given
}
df_time <- df_time |>
  mutate(cn_weekday = weekdays(date_new))|> # my computer system is in Chinese, so the weekdays funtion only returns day of the week in Chinese
  mutate(en_weekday = translate_weekday(cn_weekday))

Population data

Import

population <- read_csv("data/population2024.csv")
Rows: 50 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): US State
dbl (6): Rank, Population 2024, Population 2023, Growth Rate, % of US, Densi...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Cleaning

df_pop <- population |>
  rename(state = `US State`,
         population = `Population 2024`)|>
  select(state, population)

Inspect data

glimpse(df_pop)
Rows: 50
Columns: 2
$ state      <chr> "California", "Texas", "Florida", "New York", "Pennsylvania…
$ population <dbl> 38889770, 30976754, 22975931, 19469232, 12951275, 12516863,…

Gun control data

Import

df_gunlaw <- read_csv("data/gun_law_strength.csv")
Rows: 50 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): state
dbl (1): composite score

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Cleaning

df_gunlaw <- df_gunlaw |>
  rename(score = `composite score`)

Inspect data

glimpse(df_gunlaw)
Rows: 50
Columns: 2
$ state <chr> "California", "New York", "Illinois", "Connecticut", "Massachuse…
$ score <dbl> 89.5, 83.5, 83.0, 82.5, 81.0, 79.5, 79.0, 75.0, 69.0, 68.0, 63.0…

Save the cleaned data

saveRDS(df_cleaned, "data/df_cleaned.rds")
saveRDS(df_time, "data/df_time.rds")
saveRDS(df_pop, "data/df_pop.rds")
saveRDS(df_gunlaw, "data/df_gunlaw.rds")