Data
Data sits in two pages on shootingtracker, 2013 and 2014. In preparation for this post I copied/pasted those data in notepad and removed headers and footers. In the 2013 data I kept names of columns. The first steps of reading the data are removing some of the (for me) extraneous info, such as reference where the data came from. Subsequently the state and town are separated and a few records which do not have the correct state abbreviation are corrected. Finally, 13 is reformatted to 2013 and date is created. The last record used is from July 9th, 2014.r13 <- readLines('raw13.txt')
r14 <- readLines('raw14.txt')
r1 <- c(r13,r14)
head(r1)
[1] "Number\t Date\t Alleged Shooter\t Killed\t Wounded\t Location\t References"
[2] "1\t1/1/13\tCarlito Montoya\t4\t0\tSacramento, CA\t"
[3] " [Expand] "
[4] "2\t1/1/13\tUnknown\t1\t3\tHawthorne, CA\t"
[5] " [Expand] "
[6] "3\t1/1/13\tJulian Sims\t0\t4\tMcKeesport, PA\t"
tail(r1)
[1] "141\t7/8/2014\tUnknown\t1\t4\tSan Bernardino, CA\t"
[2] " [Expand] "
[3] "142\t7/8/2014\tUnknown\t0\t5\tProvidence, RI\t"
[4] " [Expand] "
[5] "143\t7/9/2014\tRonald Lee Haskell\t6\t1\tHouston, TX\t"
[6] " [Expand] "
r2 <- gsub('\\[[a-zA-Z0-9]*\\]','',r1)
r3 <- gsub('^ *$','',r2)
r4 <- r3[r3!='']
r5 <- gsub('\\t$','',r4)
r6 <- gsub('\\t References$','',r5)
r7 <- read.table(textConnection(r6),
sep='\t',
header=TRUE,
stringsAsFactors=FALSE)
r7$Location[r7$Location=='Washington DC'] <-
'Washington, DC'
r8 <- read.table(textConnection(as.character(r7$Location)),
sep=',',
col.names=c('Location','State'),
stringsAsFactors=FALSE)
r8$State <- gsub(' ','',r8$State)
r8$State[r8$State=='Tennessee'] <- 'TN'
r8$State[r8$State=='Ohio'] <- 'OH'
r8$State[r8$State=='Kansas'] <- 'KS'
r8$State[r8$State=='Louisiana'] <- 'LA'
r8$State[r8$State=='Illinois'] <- 'IL'
r8$State <- toupper(r8$State)
r7$State <- r8$State
r7$Location <- r8$Location
r7 <- r7[r7$State != 'PUERTORICO',]
Sys.setlocale(category = "LC_TIME", locale = "C")
r7$Date <- gsub('/13$','/2013',r7$Date)
r7$date <- as.Date(r7$Date,format="%m/%d/%Y")
Effect of day and month
Effect of day of the week is pretty easy to plot, just add the day and run qplot(weekday). In this case complexities arise because I want the days in a specific order, Monday to Sunday. Second is that not all days occur equally often in the test period. This is not enough to invalidate the plot, but since I had to correct for occurrence of months for a similar plot, I decided to reuse that code for weekdays. The data.frame alldays is used to calculate the number of days in the data set. I am not going to over analyze this, Sundays stick out in a negative way.
library(ggplot2)
r7$weekday <- factor(format(r7$date,'%a'),
levels=format(as.Date('07/07/2014',format="%m/%d/%Y")+0:6,'%a'))
r7$month <- factor(format(r7$date,'%b'),
levels=format(as.Date('01/15/2014',format="%m/%d/%Y")+
seq(0,length.out=12,by=30),'%b'))
alldays <- data.frame(
date=seq(min(r7$date),max(r7$date),by=1))
alldays$weekday <- factor(format(alldays$date,'%a'),
levels=levels(r7$weekday))
alldays$month <- factor(format(alldays$date,'%b'),
levels=format(as.Date('01/15/2014',format="%m/%d/%Y")+
seq(0,length.out=12,by=30),'%b'))
ggplot(data=data.frame(prop=as.numeric(table(r7$weekday)/
table(alldays$weekday)),
weekday=factor(levels(r7$weekday),
levels=levels(r7$weekday))),
aes(y=prop,x=weekday)) +
geom_bar(stat='identity') +
ylab('Shootings per day') +
xlab('Day of the week')
In terms of months, it seems summer is worse than the other seasons and winter is best.
ggplot(data=data.frame(prop=as.numeric(table(r7$month)/
table(alldays$month)),
month=factor(levels(r7$month),
levels=levels(r7$month))),
aes(y=prop,x=month)) +
geom_bar(stat='identity') +
ylab('Shootings per day') +
xlab('Month')
A model
It is not obvious, given the unequal distributions of weekdays over months, how significant a month effect is. To examine this, I have reorganized the data to display shootings per day. Data frame alldays is used again, now to ensure data with no shootings are correctly represented. The modeling shows a clear effect of months and the interaction of days and months on the brink of significance.
r7$one <- 1
ag <- aggregate(r7$one,
by=list(date=r7$date),FUN=sum)
counts <- merge(alldays,ag,all=TRUE)
counts$x[is.na(counts$x)] <- 0
g0 <- glm(x ~ weekday ,data=counts,
family='poisson')
g1 <- glm(x ~ weekday + month,data=counts,
family='poisson')
g2 <- glm(x ~ weekday * month,data=counts,
family='poisson')
anova(g0,g1,g2,test='Chisq')
Analysis of Deviance Table
Model 1: x ~ weekday
Model 2: x ~ weekday + month
Model 3: x ~ weekday * month
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 548 653.86
2 537 624.86 11 29.008 0.002264 **
3 471 538.99 66 85.870 0.050718 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Predictions
To understand the interaction, a plot is made of the expected values by day and month. Looking at this plot, the Sunday effect is most pronounced in Summer. In addition I would not be surprised if next Sunday has four shootings again.
pred1 <- expand.grid(weekday=factor(levels(r7$weekday),
levels=levels(r7$weekday)),
month=factor(levels(r7$month),
levels=levels(r7$month)))
preds <- predict(g2,pred1,
type='response',
se=TRUE)
pred1$fit <- preds$fit
pred1$se.fit <- preds$se.fit
limits <- aes(ymax = fit+se.fit, ymin=fit-se.fit)
p <- ggplot(pred1, aes(fill=weekday, y=fit, x=month))
dodge <- position_dodge(width=0.9)
p + geom_bar(position=dodge, stat="identity") +
geom_errorbar(limits, position=dodge, width=0.25) +
theme(legend.position = "bottom") +
ylab('Shootings per Day') +
guides (fill=guide_legend('Day'))
Follow the example?
One might ask if shootings which have much media attention give cause to copycats. This is not easy to analyze, given that clearly time effects from day and month exist. Besides, which shootings get a lot of media attention? Yet we can look at the number of shootings over time and at least add the shootings with most victims in the plot. The number of victims to be marked has arbitrarily been chosen as 18 or more. In this plot I cannot see the connection.
r7$Victims <- r7$Killed+r7$Wounded
table(r7$Victims)
4 5 6 7 8 9 10 12 13 14 18 19 20 21
322 104 25 26 11 5 4 2 2 2 1 1 1 1
p <- ggplot(counts, aes(y=x,x=date))
p + stat_smooth(span=.11) +
geom_point() +
geom_vline(xintercept=as.numeric(r7$date[r7$Victims>15])) +
ylab('shootings per day')
An ACF
Mostly because making an ACF is integral part of analyzing time series. Here it is, in case anybody doubted a week effect. I chose this fairly long lag, because 10 weeks seemed nice to me.
plot(acf(counts$x,lag.max=70))
No comments:
Post a Comment