Data
I discovered an error in previous code which made 1960 to appear twice. Hence updated script.setwd('/home/kees/Documents/tycho/')
r1 <- read.csv('MEASLES_Cases_1909-1982_20140323140631.csv',
na.strings='-',
skip=2)
r2 <- reshape(r1,
varying=names(r1)[-c(1,2)],
v.names='Cases',
idvar=c('YEAR' , 'WEEK'),
times=names(r1)[-c(1,2)],
timevar='STATE',
direction='long')
r2$STATE=factor(r2$STATE)
####################3
years <- dir(pattern='+.txt')
years
pop1 <-
lapply(years,function(x) {
rl <- readLines(x)
print(x)
sp <- grep('^U.S.',rl)
st1 <- grep('^AL',rl)
st2 <- grep('^WY',rl)
rl1 <- rl[c(sp[1]-2,st1[1]:st2[1])]
rl2 <- rl[c(sp[2]-2,st1[2]:st2[2])]
read1 <- function(rlx) {
rlx[1] <- paste('abb',rlx[1])
rlx <- gsub(',','',rlx,fixed=TRUE)
rt <- read.table(textConnection(rlx),header=TRUE)
rt[,grep('census',names(rt),invert=TRUE)]
}
rr <- merge(read1(rl1),read1(rl2))
ll <- reshape(rr,
list(names(rr)[-1]),
v.names='pop',
timevar='YEAR',
idvar='abb',
times=as.integer(gsub('X','',names(rr)[-1])),
direction='long')
})
pop <- do.call(rbind,pop1)
pop <- pop[grep('19601',rownames(pop),invert=TRUE),]
states <- rbind(
data.frame(
abb=datasets::state.abb,
State=datasets::state.name),
data.frame(abb='DC',
State='District of Columbia'))
states$STATE=gsub(' ','.',toupper(states$State))
r3 <- merge(r2,states)
r4 <- merge(r3,pop)
r4$incidence <- r4$Cases/r4$pop
r5 <- subset(r4,r4$YEAR>1927,-STATE)
r6 <- r5[complete.cases(r5),]
New variable
In previous posts it became clear there is in general a yearly cycle. However, the minimum in this cycle is in summer. This means for yearly summary it might be best not to use calender years, but rather something which breaks during summer. My choice is week 37.with(r6[r6$WEEK>30 & r6$WEEK<45,],
aggregate(incidence,by=list(WEEK=WEEK),mean))
WEEK x
1 31 0.016757440
2 32 0.013776182
3 33 0.011313391
4 34 0.008783259
5 35 0.007348603
6 36 0.006843930
7 37 0.006528467
8 38 0.007078171
9 39 0.008652546
10 40 0.016784205
11 41 0.013392375
12 42 0.016158805
13 43 0.018391632
14 44 0.021788221
r6$cycle <- r6$YEAR + (r6$WEEK>37)
Plot
States over time
Since not all states have complete data, it was decided to use state-year combinations with at least 40 observations (weeks). As can be seen there is some correlation between states, especially in 1945. If anything, correlation gets weaker past 1955.library(ggplot2)ggplot(with(r6,aggregate(incidence,
list(cycle=cycle,
State=State),
function(x)
if(length(x)>40)
sum(x) else
NA)),
aes(cycle, x,group=State)) +
geom_line(size=.1) +
ylab('Incidence registered Measles Cases Per Year') +
theme(text=element_text(family='Arial')) +
scale_y_log10()
Between states
I have seen too many examples of people rebuilding maps based on traveling times or distances. Now I want to do the same. Proper (euclidean) distance of the states would make the variables the year/week combinations, which gives all kind of scaling issues. What I did is to use correlation and transform that into something distance like. ftime is just a helper variable, so I am sure the reshape works correctly.r6$ftime <- interaction(r6$YEAR,r6$WEEK)
xm <- reshape(r6,
v.names='incidence',
idvar='ftime',
timevar='State',
direction='wide',
drop=c('abb','Cases','pop'))
xm2 <- xm[,grep('incidence',names(xm))]
cc <- cor(xm2,use='pairwise.complete.obs')
dimnames(cc) <- lapply(dimnames(cc),function(x) sub('incidence.','',x))
dd <- as.dist(1-cc/2)
The heatmap reveals the structure best.
heatmap(as.matrix(dd),dist=as.dist,symm=TRUE)
MDS is most nice to look at. I will leave comparisons to the US map to those who actually know all these state's relative locations.
library(MASS)
mdsx <- isoMDS(dd)
par(mai=rep(0,4))
plot(mdsx$points,
type = "n",
axes=FALSE,
xlim=c(-1,1),
ylim=c(-1,1.1))
text(mdsx$points, labels = dimnames(cc)[[1]])