## 1 Keen Chapter 4

Graphics out of the book Graphics for Statistics and Data Analysis with R by Kevin Keen (book home page)

### 1.1 Figure 4.17 EDF plot

• first set up the data frame:
• we use the ecdf() function to create a step function ecdfmass()
• from this function we can extract the knots (which will be mapped to the x-axis)
• using these knots as arguments in ecdfmass() we'll get the corresponding probabilities (which we will map to the y-axis)
• the end column in df contains the end points of the horizontal lines in the step function - it's only the knots vector beginning with the second element and setting the last element to NA

# Sample of masses in grams (the plots below label this axis "Mass (g)").
mass <- c(
  5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0,
  110.0, 115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0, 130.0,
  150.0, 145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0, 197.0, 218.0,
  300.0, 260.0, 265.0, 250.0, 250.0, 300.0, 320.0, 514.0, 556.0, 840.0,
  685.0, 700.0, 700.0, 690.0, 900.0, 650.0, 820.0, 850.0, 900.0, 1015.0,
  820.0, 1100.0, 1000.0, 1100.0, 1000.0, 1000.0
)

# Empirical distribution function of the sample, the locations of its jumps,
# and the value the function takes at each of those jumps.
ecdfmass <- ecdf(mass)
kn <- knots(ecdfmass)
ed <- ecdfmass(kn)

# One row per step: where it starts (knots), its height (ed) and where it
# stops (end). `end` is the knot vector shifted by one position; indexing one
# past the last element yields NA, so the final step has no end point.
df <- data.frame(
  knots = kn,
  ed = ed,
  end = kn[seq_along(kn) + 1L]
)


  knots         ed  end
1   5.9 0.01785714 32.0
2  32.0 0.03571429 40.0
3  40.0 0.05357143 51.5
4  51.5 0.07142857 70.0
5  70.0 0.08928571 78.0
6  78.0 0.10714286 80.0


• now we first set the aesthetics for the points: x to knots and y to ed
• add the point layer and setting the point size to 3: geom_point(size=3)
• add the lines using a segment layer setting the aesthetics xend and yend: geom_segment(aes(xend=end,yend=ed))
• in the next step we add two additional segment layers, one for each arrow; you can also use annotate() to do this; inside these segments we use the arrow() function from the grid package, so we can define the appearance of our arrows
• the next two lines change the appearance of the axes: setting the limits, the breaks and the expansion
• then set the appropriate axes titles and customize axis elements and panel.background

# Attach the plotting packages with library() so that a missing package stops
# the script immediately (require() would only return FALSE and the failure
# would surface later as "could not find function").
library(ggplot2)
library(grid) # for the arrow() and unit() functions

# Figure 4.17: EDF drawn as points at the knots with a horizontal segment
# running from each knot to the next one.
ggplot(df, aes(x = knots, y = ed)) +
  geom_point(size = 3) +
  # horizontal step from each knot to the following knot
  geom_segment(aes(xend = end, yend = ed)) +
  # left arrow: EDF equals 0 below the smallest knot
  geom_segment(
    x = min(df$knots), xend = -45, y = 0, yend = 0,
    arrow = arrow(length = unit(0.15, "cm")), size = 0.4
  ) +
  # right arrow: EDF equals 1 from the largest knot onwards
  geom_segment(
    x = max(df$knots), xend = 1245, y = 1, yend = 1,
    arrow = arrow(length = unit(0.15, "cm")), size = 0.4
  ) +
  # fixed limits without expansion so the arrow heads end at the panel border
  scale_x_continuous(
    limits = c(-45, 1245), breaks = seq(0, 1200, by = 200), expand = c(0, 0)
  ) +
  scale_y_continuous(
    limits = c(-0.01, 1.05), breaks = seq(0, 1, by = 0.2), expand = c(0, 0)
  ) +
  xlab("Mass (g)") +
  ylab("Empirical Distribution Function") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    axis.text = element_text(colour = "black", size = 14),
    axis.title = element_text(colour = "black", size = 14),
    axis.ticks = element_line(colour = "black"),
    axis.ticks.length = unit(3, "mm")
  )


### 1.2 Figure 4.18 EDF plot

• we only have to change the axis breaks of the y-axis and add the horizontal lines
• the first is done changing the by argument in the seq() to 0.25 in scale_y_continuous()
• then we add a hline layer

# Figure 4.18: same point-and-segment EDF, with dashed reference lines at
# 0.25, 0.5 and 0.75 and y-axis breaks every 0.25.
ggplot(df, aes(x = knots, y = ed)) +
  geom_point(size = 3) +
  geom_segment(aes(xend = end, yend = ed)) +
  # arrows continuing the EDF to the left (at 0) and to the right (at 1)
  geom_segment(
    x = min(df$knots), xend = -45, y = 0, yend = 0,
    arrow = arrow(length = unit(0.15, "cm")), size = 0.4
  ) +
  geom_segment(
    x = max(df$knots), xend = 1245, y = 1, yend = 1,
    arrow = arrow(length = unit(0.15, "cm")), size = 0.4
  ) +
  geom_hline(yintercept = c(0.25, 0.5, 0.75), linetype = "dashed") +
  scale_x_continuous(
    limits = c(-45, 1245), breaks = seq(0, 1200, by = 200), expand = c(0, 0)
  ) +
  scale_y_continuous(
    limits = c(-0.01, 1.05), breaks = seq(0, 1, by = 0.25), expand = c(0, 0)
  ) +
  labs(x = "Mass (g)", y = "Empirical Distribution Function") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    axis.text = element_text(colour = "black", size = 14),
    axis.title = element_text(colour = "black", size = 14),
    axis.ticks = element_line(colour = "black"),
    axis.ticks.length = unit(3, "mm")
  )


• we can also use the grid lines (but then we also get lines at 0 and 1)

# Figure 4.18, alternative: instead of an hline layer, the dashed horizontal
# lines come from the theme's major y grid, so one appears at every y break.
ggplot(df, aes(x = knots, y = ed)) +
  geom_point(size = 3) +
  geom_segment(aes(xend = end, yend = ed)) +
  # arrows continuing the EDF to the left (at 0) and to the right (at 1)
  geom_segment(
    x = min(df$knots), xend = -45, y = 0, yend = 0,
    arrow = arrow(length = unit(0.15, "cm")), size = 0.4
  ) +
  geom_segment(
    x = max(df$knots), xend = 1245, y = 1, yend = 1,
    arrow = arrow(length = unit(0.15, "cm")), size = 0.4
  ) +
  scale_x_continuous(
    limits = c(-45, 1245), breaks = seq(0, 1200, by = 200), expand = c(0, 0)
  ) +
  scale_y_continuous(
    limits = c(-0.01, 1.05), breaks = seq(0, 1, by = 0.25), expand = c(0, 0)
  ) +
  labs(x = "Mass (g)", y = "Empirical Distribution Function") +
  theme(
    panel.background = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(linetype = "dashed", colour = "grey50"),
    axis.line = element_line(colour = "black"),
    axis.text = element_text(colour = "black", size = 14),
    axis.title = element_text(colour = "black", size = 14),
    axis.ticks = element_line(colour = "black"),
    axis.ticks.length = unit(3, "mm")
  )


### 1.3 Figure 4.19 EDF plot

• replace geom_point() by geom_step()
• get rid of the horizontal lines
• add another little segment which connects the left arrow with the step function: geom_segment(x=min(df$knots),xend=min(df$knots),y=0,yend=min(df$ed),size=0.4)
• leave everything else as it is

# Figure 4.19: EDF drawn as a connected step function.
ggplot(df, aes(x = knots, y = ed)) +
  geom_step(direction = "hv") +
  # left arrow: EDF equals 0 below the smallest knot
  geom_segment(
    x = min(df$knots), xend = -45, y = 0, yend = 0,
    arrow = arrow(length = unit(0.15, "cm")), size = 0.4
  ) +
  # right arrow: EDF equals 1 from the largest knot onwards
  geom_segment(
    x = max(df$knots), xend = 1245, y = 1, yend = 1,
    arrow = arrow(length = unit(0.15, "cm")), size = 0.4
  ) +
  # vertical riser joining the left arrow (y = 0) to the first step
  geom_segment(
    x = min(df$knots), xend = min(df$knots), y = 0, yend = min(df$ed),
    size = 0.4
  ) +
  scale_x_continuous(
    limits = c(-45, 1245), breaks = seq(0, 1200, by = 200), expand = c(0, 0)
  ) +
  scale_y_continuous(
    limits = c(-0.01, 1.05), breaks = seq(0, 1, by = 0.2), expand = c(0, 0)
  ) +
  xlab("Mass (g)") +
  ylab("Empirical Distribution Function") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    axis.text = element_text(colour = "black", size = 14),
    axis.title = element_text(colour = "black", size = 14),
    axis.ticks = element_line(colour = "black"),
    axis.ticks.length = unit(3, "mm")
  )


• instead of using the data frame created above you can use the original data (mass) and set stat to ecdf
• but for the arrows you need to calculate the vals anyway, that's why I use the data frame above

# Raw observations in a data frame so that the ecdf stat can work on them.
df3 <- data.frame(mass = mass)

# Figure 4.19 again, but the step layer computes the EDF itself from the raw
# data; df is still needed for the positions of the arrows and the riser.
ggplot(df, aes(x = knots, y = ed)) +
  # own data and own aesthetics: the plot-level x/y mapping must not be inherited
  geom_step(data = df3, aes(x = mass), stat = "ecdf", inherit.aes = FALSE) +
  geom_segment(
    x = min(df$knots), xend = -45, y = 0, yend = 0,
    arrow = arrow(length = unit(0.15, "cm")), size = 0.4
  ) +
  geom_segment(
    x = max(df$knots), xend = 1245, y = 1, yend = 1,
    arrow = arrow(length = unit(0.15, "cm")), size = 0.4
  ) +
  # vertical riser joining the left arrow (y = 0) to the first step
  geom_segment(
    x = min(df$knots), xend = min(df$knots), y = 0, yend = min(df$ed),
    size = 0.4
  ) +
  scale_x_continuous(
    limits = c(-45, 1245), breaks = seq(0, 1200, by = 200), expand = c(0, 0)
  ) +
  scale_y_continuous(
    limits = c(-0.01, 1.05), breaks = seq(0, 1, by = 0.2), expand = c(0, 0)
  ) +
  xlab("Mass (g)") +
  ylab("Empirical Distribution Function") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    axis.text = element_text(colour = "black", size = 14),
    axis.title = element_text(colour = "black", size = 14),
    axis.ticks = element_line(colour = "black"),
    axis.ticks.length = unit(3, "mm")
  )


### 1.4 Figure 4.20 EDF plot

• the last plot, only with horizontal lines added at the quartiles

# Figure 4.20: step-function EDF with dashed reference lines at 0.25, 0.5 and
# 0.75 and y-axis breaks every 0.25.
ggplot(df, aes(x = knots, y = ed)) +
  geom_step(direction = "hv") +
  geom_segment(
    x = min(df$knots), xend = -45, y = 0, yend = 0,
    arrow = arrow(length = unit(0.15, "cm")), size = 0.4
  ) +
  geom_segment(
    x = max(df$knots), xend = 1245, y = 1, yend = 1,
    arrow = arrow(length = unit(0.15, "cm")), size = 0.4
  ) +
  # vertical riser joining the left arrow (y = 0) to the first step
  geom_segment(
    x = min(df$knots), xend = min(df$knots), y = 0, yend = min(df$ed),
    size = 0.4
  ) +
  geom_hline(yintercept = c(0.25, 0.5, 0.75), linetype = 2) +
  scale_x_continuous(
    limits = c(-45, 1245), breaks = seq(0, 1200, by = 200), expand = c(0, 0)
  ) +
  scale_y_continuous(
    limits = c(-0.01, 1.05), breaks = seq(0, 1, by = 0.25), expand = c(0, 0)
  ) +
  xlab("Mass (g)") +
  ylab("Empirical Distribution Function") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    axis.text = element_text(colour = "black", size = 14),
    axis.title = element_text(colour = "black", size = 14),
    axis.ticks = element_line(colour = "black"),
    axis.ticks.length = unit(3, "mm")
  )


### 1.5 Figure 4.20 EDF plot with cumulative normal distribution function added

• first we need to create a second data frame containing the values defining the curve; we choose to use an evenly spaced grid of 10000 intervals on the x-axis and use pnorm() to calculate the respective y values (using the empirical mean and the empirical sd of the vector mass)
• then we add the layer (geom_line())
• change the limits of the x-axis and the breaks inside scale_x_continuous()
• change the length of the right arrow (setting xend to 1500)

# Parameters of the normal curve (sample mean and sd of mass) and the x-range
# it is evaluated on: from the smallest observation up to 1500.
mean_mass <- mean(mass)
sd_mass <- sd(mass)
min_mass <- min(mass)
max_mass <- 1500

# Evenly spaced grid from min_mass to max_mass (10000 intervals) and the
# normal CDF with the parameters above evaluated on that grid.
xx <- min_mass + (0:10000) * (max_mass - min_mass) / 10000
yy <- pnorm(xx, mean = mean_mass, sd = sd_mass)

df2 <- data.frame(xx = xx, yy = yy)

# Step-function EDF with the normal CDF curve stored in df2 drawn on top; the
# x-axis and the right arrow extend to 1500.
ggplot(df, aes(x = knots, y = ed)) +
  geom_step(direction = "hv") +
  # left arrow: EDF equals 0 below the smallest knot
  geom_segment(
    x = min(df$knots), xend = -45, y = 0, yend = 0,
    arrow = arrow(length = unit(0.15, "cm")), size = 0.4
  ) +
  # right arrow, lengthened to the new upper x limit
  geom_segment(
    x = max(df$knots), xend = 1500, y = 1, yend = 1,
    arrow = arrow(length = unit(0.15, "cm")), size = 0.4
  ) +
  # vertical riser joining the left arrow (y = 0) to the first step
  # (plain `$` here: the escaped form `df\$ed` does not parse as R)
  geom_segment(
    x = min(df$knots), xend = min(df$knots), y = 0, yend = min(df$ed),
    size = 0.4
  ) +
  # curve layer with its own data and aesthetics
  geom_line(data = df2, aes(x = xx, y = yy)) +
  scale_x_continuous(
    limits = c(-45, 1500), breaks = seq(0, 1500, by = 500), expand = c(0, 0)
  ) +
  scale_y_continuous(
    limits = c(-0.01, 1.05), breaks = seq(0, 1, by = 0.2), expand = c(0, 0)
  ) +
  xlab("Mass (g)") +
  ylab("Empirical Distribution Function") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    axis.text = element_text(colour = "black", size = 14),
    axis.title = element_text(colour = "black", size = 14),
    axis.ticks = element_line(colour = "black"),
    axis.ticks.length = unit(3, "mm")
  )