# Load some useful libraries.
# plyr is attached before dplyr on purpose: both export mutate()/summarise(),
# and the package attached last wins, so the dplyr versions are the ones used.
library(ggplot2)
library(plyr)
library(dplyr)
library(tidyr)

# Fivethirtyeight is a company full of nerds (like us!) that have compiled lots
# of interesting data
# US births between 2000 - 2014
births <- read.csv(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/births/US_births_2000-2014_SSA.csv",
  header = TRUE
)

# Take a look at the data
head(births)

# Add a column with a sequential index for plotting purposes
# (seq_len() stays empty for a zero-row data frame, unlike 1:nrow())
births$birthdate <- seq_len(nrow(births))

# Plot how the number of births has changed over time
ggplot(births, aes(x = birthdate, y = births)) +
  geom_line()

# Funny periodicity! Let's split this by day of the week
ggplot(births, aes(x = birthdate, y = births, color = day_of_week)) +
  geom_point()

# Something funny is still going on here. What do day_of_week numbers
# correspond to?
# Hint: http://www.onthisday.com/date/2000/january/1

# Look at our data another way, and note that day_of_week is an int (makes more
# sense as factor)
as_tibble(births)
births$day_of_week <- factor(births$day_of_week)
# Relabel the factor levels in their sorted order with weekday names
levels(births$day_of_week) <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
)

# Try plotting, now with factors
ggplot(births, aes(x = birthdate, y = births, color = day_of_week)) +
  geom_point()

# Looks like people don't like having babies on the weekends

# Let's look just at the year 2000 to try to figure out what the periodicity is
# all about
y2k <- subset(births, year == 2000)

# Now you try: how many days are in the year 2000?

# Now you try: what is the average number of births in the year 2000?

# Look at some statistics by day of the week.
# summarise() collapses each group to a single row, giving one mean per weekday
births %>%
  group_by(day_of_week) %>%
  summarise(mean_births = mean(births))

# Looks like weekends are an unpopular time to have a baby
# Two-sample t-test: weekend births vs. weekday births
t.test(
  subset(births, day_of_week %in% c("Saturday", "Sunday"))$births,
  subset(births, !(day_of_week %in% c("Saturday", "Sunday")))$births
)

# Now you try: Are Saturday vs Sunday births significantly different?
# Plot single year's worth of births
ggplot(y2k, aes(x = birthdate, y = births, color = day_of_week)) +
  geom_point()

# Something funny going on at the end of the year. Same for 2001?
# Now you try: make a variable called year2001 with data only from the year 2001.

# Plot year 2001's data to see if end of the year also looks weird
# NOTE: `year2001` is not created anywhere in this script as shown; this call
# fails until you have defined it in the exercise above.
ggplot(year2001, aes(x = birthdate, y = births, color = day_of_week)) +
  geom_point()

# Looks like people try to avoid having babies right around Christmas.
# How many births were there over 15 years on Christmas?
subset(births, month == 12 & date_of_month == 25)$births

# How many births were there over 15 years on June 25th?
subset(births, month == 6 & date_of_month == 25)$births

# Let's see if there were more births on June 25 than December 25
# (multiple criteria here!)
t.test(
  subset(births, month == 6 & date_of_month == 25)$births,
  subset(births, month == 12 & date_of_month == 25)$births
)

# Now you try: are there fewer births on Christmas and Xmas eve than New Year's
# Eve and New Year's?

# Silly superstition: Friday the 13th is unlucky.
# Now you try: do people have fewer births on Friday the 13th than other random
# Fridays?

# Prep data for plotting
# `the13` is TRUE for rows falling on the 13th of the month; the comparison is
# already logical, so no ifelse() wrapper is needed.
friday <- subset(births, day_of_week == "Friday") %>%
  mutate(the13 = date_of_month == 13)

# Look at how many Friday the 13th's there are in our data
table(friday$the13)

# Make a violin plot comparing Friday the 13th vs other Fridays
ggplot(friday, aes(x = the13, y = births)) +
  geom_violin()

# Extra super bonus
# Now you try: how many Friday dates aside from the 13th had fewer births than
# the average Friday the 13th?

# Now read a fun article reflecting some analyses here:
# https://fivethirtyeight.com/features/some-people-are-too-superstitious-to-have-a-baby-on-friday-the-13th/