Data
Data is from census.gov. I have taken the first spreadsheet. It is one of those spreadsheets with counts and percentages and empty lines to display categories. Very nice to check some numbers, horrible to process. So, a bit of code to extract the numbers.library(gdata)
r1 <- read.xls('2010_Table_1.xls',stringsAsFactors=FALSE)
# throw out percentages
r2 <- r1[,r1[4,]!='Percent']
# put all column names in one row
r2$X.6[2] <- r2$X.6[3]
r2$X.8[2] <- r2$X.8[3]
# select part with data
r3 <- r2[2:61,c(1,3,5,6)]
names(r3)[1] <- r3[1,1]
r4 <-r3[c(-1:-3),]
#eliminate one row with mean income.
r4 <- r4[-grep('$',r4[,2],fixed=TRUE),]
#reshape in long form
r5 <- reshape(r4,
varying=list(names(r4)[-1]),
v.names='count',
direction='long',
idvar='Characteristic',
timevar='class',
times=r3[1,2:4])
row.names(r5) <- 1:nrow(r5)
# remove ',' from numbers and make numerical values.
# units are in 1000, so update that too
r5$count <- as.numeric(gsub(',','',r5$count))*1000
# clean up numbers used for footnotes
r5$class <- gsub('(1|2|3)','',r5$class)
#some upfront '.' removed.
r5$Characteristic <- gsub('^\\.+','',r5$Characteristic)
# create a factor
r5$Characteristic <- factor(r5$Characteristic,
levels=rev(r5$Characteristic[r5$class=='Home Workers']))
# and create a higher level factor
r5$Mchar=r5$Characteristic
for (i in 1:nrow(r5)) r5$Mchar[i] <-
if(is.na(r5$count[i]) | r5$Mchar[i]=='Total') r5$Mchar[i]
else r5$Mchar[i-1]
Plot
The plot is made using old style graphics. I could not get either ggplot2 or lattice to provide the plot I wanted.# prepare for axis labels
index <- subset(r5,r5$class=='Home Workers',c(Characteristic,Mchar))
index$y=56:1
index2 <- index[index$Characteristic!=index$Mchar | index$Characteristic=='Total',]
index3 <- index[index$Characteristic==index$Mchar & index$Characteristic!='Total',]
r6 <- merge(r5,index)
r6$class <- factor(r6$class)
par(mar=c(5,18,4,2)+.1,cex=.7)
plot(x=r6$count,y=r6$y,axes=FALSE,
xlab='Count',
ylab='',
col=c('red','green','blue')[r6$class],
frame.plot=TRUE,
# log='x',
ylim=c(2,58))
axis(1)
axis(2,at=index2$y,labels=index2$Characteristic,las=1)
text(y=index3$y-.1,x=30000,labels=index3$Characteristic,adj=0)
legend('topleft',legend=levels(r6$class),
ncol=3,col=c('red','green','blue'),
border=NULL,pch=1,
yjust=0)
Why I did not use ggplot2?
The ideal solution for ggplot2 might look something like this:r7 <- r5[!is.na(r5$count),]
r7$Mchar <- factor(r7$Mchar,levels=unique(r7$Mchar))
ggplot(data=r7,
aes(x=Characteristic,y=count,col=class)) +
geom_point()+
coord_flip()+
xlab('')+ylab('')+
ylim(0,max(r5$count))+
facet_wrap(~Mchar,scales='free_x',ncol=2)+
theme(legend.position="bottom")
However, this throws an error:
Error in facet_render.wrap(plot$facet, panel, plot$coordinates, theme, :
ggplot2 does not currently support free scales with a non-cartesian coord or coord_flip.
I also tried the system described here: http://wresch.github.io/2014/05/22/aligning-ggplot2-graphs.html, but I think width has changed in content, could not get that to be satisfactory.
library(gtable)
library(gridExtra)
tt <- as.data.frame(table(r7$Mchar))
tt$Var1
tt$Freq[12] <- tt$Freq[12] +15
la <- lapply(tt$Var1,function(x) {
r8 <- r5[r5$Mchar==as.character(x) ,]
r8 <- r8[ !is.na(r8$count),]
ggplot(data=r8,
aes(x=Characteristic,y=count,col=class)) +
geom_point()+
coord_flip()+
xlab('')+ylab('')+
ylim(0,max(r5$count))
})
# http://wresch.github.io/2014/05/22/aligning-ggplot2-graphs.html
lax <- lapply(la,function(x) x$widths[2:3])
maxwidths <- do.call(grid::unit.pmax,lax)
for(i in 1:12) la[[i]]$widths <- as.list(maxwidths)
la[[12]] <- la[[12]] +
theme(legend.position="bottom",
plot.margin = unit(c(0.01, 0.1, 0.02, 0.1), "null"))
for (i in 1:11) la[[i]] <- la[[i]] +
theme(legend.position="none",
axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.ticks.x = element_blank(),
plot.margin = unit(c(0.01, 0.1, 0.02, 0.1), "null"))
lag <- lapply(la,ggplotGrob)
g <- gtable_matrix(name = "demo",
grobs = matrix(lag, nrow = 12),
widths = unit(9, "null"),
heights = unit(tt$Freq, "null"))
grid.newpage()
grid.draw(g)
Here is a ggplot2 solution:
ReplyDeleteggplot(data=r7,
aes(y=Characteristic,x=count,col=class)) +
geom_point()+
xlab(NULL)+ylab(NULL)+
xlim(0,max(r5$count))+
facet_grid(Mchar~.,scales="free_y", space = "free")+
theme(legend.position="bottom")
Erwan