\documentclass[a4paper]{article}

\begin{document}

\title{Statistics II: Histograms in R}
\author{Richard Gill\thanks{\texttt{http://www.math.leidenuniv.nl/$\sim$gill/teaching/statistics}}}
\date{\today}
\maketitle

\noindent
These scripts and notes illustrate the histogram as a density estimator.
First we make some data.
And before that, I set the random seed, so that the results will be perfectly reproducible.
%
% Plain (non-figure) chunk: fix the seed, then draw 100 gamma variates with shape 10.
<<>>=
set.seed(11091951)
x <- rgamma(100, 10)
@
%
Now we show the plain vanilla R histogram.
%
% Chunks that draw a plot carry fig=TRUE so that Sweave includes the figure.
\begin{center}
<<fig=TRUE>>=
hist(x)
@
\end{center}
%
\newpage\noindent
The heights of the bars in that histogram are counts, so they add up to the number of observations
(the area under the graph is that number times the bin-width).
Using the option \texttt{prob=TRUE} we make the area under the graph equal to one.
On this graph I will superimpose a plot of the true density of the data.
I'll also fix the axes to get a nicer plot.
The bin-width is determined by Sturges' method; this is the default, but it doesn't hurt to make that explicit.
%
% xd is the plotting grid and yd the true gamma(shape = 10) density on it;
% both are reused by the later chunks.
\begin{center}
<<fig=TRUE>>=
hist(x, prob = TRUE, xlim = c(0, 30), ylim = c(0, 0.15), breaks = "Sturges")
xd <- seq(from = 0, to = 30, length.out = 1000)
yd <- dgamma(xd, shape = 10)
lines(xd, yd)
abline(h = 0)
@
\end{center}
%
\newpage\noindent
Now let's look at the results of the other bin-width algorithms.
First, Scott:
%
\begin{center}
<<fig=TRUE>>=
hist(x, prob = TRUE, xlim = c(0, 30), ylim = c(0, 0.15), breaks = "Scott")
lines(xd, yd)
abline(h = 0)
@
\end{center}
%
\newpage\noindent
And Freedman-Diaconis:
%
\begin{center}
<<fig=TRUE>>=
hist(x, prob = TRUE, xlim = c(0, 30), ylim = c(0, 0.15), breaks = "FD")
lines(xd, yd)
abline(h = 0)
@
\end{center}
%
\newpage\noindent
Not much to see, right?
So I'll draw them all again, now with an outlier added to the dataset.
First, plain vanilla (Sturges):
%
% Each plotting chunk carries fig=TRUE so that Sweave includes the figure.
% The chunks below use x, xd and yd, which must already exist in the R session.
\begin{center}
<<fig=TRUE>>=
hist(c(x, 50), prob = TRUE, xlim = c(0, 50), ylim = c(0, 0.15), breaks = "Sturges")
lines(xd, yd)
abline(h = 0)
@
\end{center}
%
\newpage\noindent
Then Scott:
%
\begin{center}
<<fig=TRUE>>=
hist(c(x, 50), prob = TRUE, xlim = c(0, 50), ylim = c(0, 0.15), breaks = "Scott")
lines(xd, yd)
abline(h = 0)
@
\end{center}
%
\newpage\noindent
And Freedman-Diaconis:
%
\begin{center}
<<fig=TRUE>>=
hist(c(x, 50), prob = TRUE, xlim = c(0, 50), ylim = c(0, 0.15), breaks = "Freedman-Diaconis")
lines(xd, yd)
abline(h = 0)
@
\end{center}
%
\newpage\noindent
Venables and Ripley think they have a much better histogram.
Let's take a look.
Scott:
%
% truehist() comes from the MASS package, which is loaded in this chunk.
\begin{center}
<<fig=TRUE>>=
library(MASS)
truehist(c(x, 50), prob = TRUE, xlim = c(0, 50), ylim = c(0, 0.15), nbins = "Scott")
lines(xd, yd)
abline(h = 0)
@
\end{center}
%
\newpage\noindent
And Freedman-Diaconis:
%
\begin{center}
<<fig=TRUE>>=
truehist(c(x, 50), prob = TRUE, xlim = c(0, 50), ylim = c(0, 0.15), nbins = "Freedman-Diaconis")
lines(xd, yd)
abline(h = 0)
@
\end{center}
%
\bigskip

\noindent\textbf{Exercise}.
Figure out how to draw a frequency polygon (join the mid-points of the histogram bars by straight lines\dots).
In theory you can then use a larger bin-width, at least if the true density is twice differentiable.
\end{document}

# Scratch R session kept for reference.
# LaTeX stops reading at \end{document}, so nothing below is typeset.

help(hist)
help(rgamma)

# Sample data and the default (count) histogram.
x <- rgamma(100, 10)
hist(x)

# Density-scale histograms with the true gamma(shape = 10) density on top.
hist(x, prob = TRUE)
y <- seq(from = 0, to = 30, length.out = 1000)
lines(y, dgamma(y, shape = 10))
hist(x, prob = TRUE, breaks = "Scott")
lines(y, dgamma(y, shape = 10))
hist(x, prob = TRUE, breaks = "FD")
lines(y, dgamma(y, shape = 10))

# The same, with an outlier at 50 appended to the data.
hist(c(x, 50), prob = TRUE, breaks = "FD")
lines(y, dgamma(y, shape = 10))
hist(c(x, 50), prob = TRUE, breaks = "FD", ylim = c(0, 0.15))
lines(y, dgamma(y, shape = 10))
hist(c(x, 50), prob = TRUE, breaks = "Scott", ylim = c(0, 0.15))
lines(y, dgamma(y, shape = 10))
hist(c(x, 50), prob = TRUE, breaks = "Sturges", ylim = c(0, 0.15))
lines(y, dgamma(y, shape = 10))

# MASS::truehist on the data with the outlier.
library(MASS)
truehist(c(x, 50), prob = TRUE, ylim = c(0, 0.15))
lines(y, dgamma(y, shape = 10))
truehist(c(x, 50), prob = TRUE, nbins = "FD", ylim = c(0, 0.15))
lines(y, dgamma(y, shape = 10))
truehist(c(x, 50), prob = TRUE, nbins = "Scott", ylim = c(0, 0.15))
lines(y, dgamma(y, shape = 10))
truehist(c(x, 50), prob = TRUE, nbins = "FD", ylim = c(0, 0.15))
lines(y, dgamma(y, shape = 10))