Thursday, July 9, 2015

label outlier in ggplot2 boxplot

  • function to add labels to outliers in a ggplot2 boxplot
  • the function add.outlier() takes a ggplot boxplot object as input
  • the second optional input is a string containing the name of the variable containing the labels, the default is the value itself
  • the function expects a unique mapping to x and y, where x is a factor variable
  • the data frame given to the ggplot object must contain the x, y, and the labelling variable
require(ggplot2)

mtcars$cyl <- factor(mtcars$cyl)
mtcars$labels <- row.names(mtcars)

p <- ggplot(mtcars,aes(x=cyl,colour=cyl,y=qsec)) +
    geom_boxplot()



add.outlier <- function(p,labvar = as.character(p$mapping$y)){
      df <- data.frame(y = with(p$data,eval(p$mapping$y)),
                       x = with(p$data,eval(p$mapping$x)))
  
      df.l <- split(df,df$x)
      
      mm <- Reduce(rbind, lapply(df.l,FUN = function(df){
                                     data.frame(y = df$y[df$y <= (quantile(df$y)[2] - 1.5 * IQR(df$y)) | df$y >= (quantile(df$y)[4] + 1.5 * IQR(df$y))],
                                                x = df$x[df$y <= (quantile(df$y)[2] - 1.5 * IQR(df$y)) | df$y >= (quantile(df$y)[4] + 1.5 * IQR(df$y))]
                                                )})
                   )
  
      
      mm$x <- factor(mm$x,levels=sort(as.numeric(as.character(unique(p$data[,as.character(p$mapping$x)])))),
                     labels = levels(p$data[,as.character(p$mapping$x)])
                     )
      
      names(mm) <- c(as.character(p$mapping$y),as.character(p$mapping$x))
      mm <- merge(p$data[,c(names(mm),labvar)],mm)
      
      p + geom_text(data=mm,
                    aes_string(label=labvar),
                    vjust = -0.5)
}

add.outlier(p)





add.outlier(p,"labels")



Saturday, April 25, 2015

R - read excel files without java or perl dependencies

1 read Excel files without java or perl dependencies

With the new package from hadley wickham it is finally possible to read excel files without java or excel dependencies, yeah… For now there are only two functions:

  • excel_sheets() to get all sheets contained in an excel file
  • read_excel() to read one sheet out of an excel file
  • get the filenames of excel files contained in the current directory

require(readxl)
dir(pattern = "xls")

[1] "20140927excelfile.xlsx" "example.xlsx"

  • get the names of the excel sheets contained in the excel file (only one in this case)
  • the function has the file name resp. the whole path as argument


excel_sheets("example.xlsx")

[1] "conspvalues"

  • read in the sheet, the function takes the path and the sheet as argument
  • the sheet can be given as position (first sheet = 1, second = 2, etc) or as name (the first sheet is the default)


xx <- read_excel("example.xlsx",1)
head(xx)

mpg cyl    disp  hp drat      wt  qsec vs am gear carb
1 21.0   6     160 110 3.90    2.62 16.46  0  1    4    4
2 21.0   6 missing 110 3.90   2.875 17.02  0  1    4    4
3 22.8   4     108  93 3.85    2.32 18.61  1  1    4    1
4 21.4   6     258 110 3.08 missing 19.44  1  0    3    1
5 18.7   8     360 175 3.15    3.44 17.02  0  0    3    2
6 18.1   6     225 105 2.76    3.46 20.22  1  0    3    1


xx <- read_excel("example.xlsx","conspvalues")
head(xx)

mpg cyl    disp  hp drat      wt  qsec vs am gear carb
1 21.0   6     160 110 3.90    2.62 16.46  0  1    4    4
2 21.0   6 missing 110 3.90   2.875 17.02  0  1    4    4
3 22.8   4     108  93 3.85    2.32 18.61  1  1    4    1
4 21.4   6     258 110 3.08 missing 19.44  1  0    3    1
5 18.7   8     360 175 3.15    3.44 17.02  0  0    3    2
6 18.1   6     225 105 2.76    3.46 20.22  1  0    3    1

  • there is an additional argument to set na strings (na)

xx <- read_excel("example.xlsx","conspvalues",na="missing")
head(xx)

mpg cyl disp  hp drat    wt  qsec vs am gear carb
1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
2 21.0   6   NA 110 3.90 2.875 17.02  0  1    4    4
3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
4 21.4   6  258 110 3.08    NA 19.44  1  0    3    1
5 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
6 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

  • with skip you can skip the first lines


xx <- read_excel("example.xlsx","conspvalues",skip=1)
head(xx)

21 6     160 110  3.9    2.62 16.46 0 1 4 4
1 21.0 6 missing 110 3.90   2.875 17.02 0 1 4 4
2 22.8 4     108  93 3.85    2.32 18.61 1 1 4 1
3 21.4 6     258 110 3.08 missing 19.44 1 0 3 1
4 18.7 8     360 175 3.15    3.44 17.02 0 0 3 2
5 18.1 6     225 105 2.76    3.46 20.22 1 0 3 1
6 14.3 8     360 245 3.21    3.57 15.84 0 0 3 4

  • and you can use col_names to tell the function whether or not the excel file containes the column names


xx <- read_excel("example.xlsx","conspvalues",skip=1, col_names=F)
head(xx)

X0 X1      X2  X3   X4      X5    X6 X7 X8 X9 X10
1 21.0  6     160 110 3.90    2.62 16.46  0  1  4   4
2 21.0  6 missing 110 3.90   2.875 17.02  0  1  4   4
3 22.8  4     108  93 3.85    2.32 18.61  1  1  4   1
4 21.4  6     258 110 3.08 missing 19.44  1  0  3   1
5 18.7  8     360 175 3.15    3.44 17.02  0  0  3   2
6 18.1  6     225 105 2.76    3.46 20.22  1  0  3   1


  • using col_types you can define the types of your columns (instead of letting R guess), attention: NA values may be produced
  • you need one name and type for each column


xx <- read_excel("example.xlsx","conspvalues",skip=1, col_types = rep("numeric",11))
head(xx)

Warnmeldungen:
1: In read_xlsx_(path, sheet, col_names = col_names, col_types = col_types,  :
  [3, 3]: expecting numeric: got 'missing'
2: In read_xlsx_(path, sheet, col_names = col_names, col_types = col_types,  :
  [5, 6]: expecting numeric: got 'missing'
    21 6 160 110  3.9  2.62 16.46 0 1 4 4
1 21.0 6  NA 110 3.90 2.875 17.02 0 1 4 4
2 22.8 4 108  93 3.85 2.320 18.61 1 1 4 1
3 21.4 6 258 110 3.08    NA 19.44 1 0 3 1
4 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
5 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
6 14.3 8 360 245 3.21 3.570 15.84 0 0 3 4

Sunday, March 15, 2015

function which runs two sample t-test on grouped data

1 function which runs two sample t-test on grouped data frame

  • the functions takes a data frame as its first argument
  • the second argument group is the column which should use for splitting the data frame
  • col indicates the numeric column to pass through to the t.test() function
  • incol should be a binary variable, which is also be passed through to t.test()


df.t.test <- function(df,group,col,indcol){
    t.test.helper <- function(x,col,indcol,group){
        tob <- t.test(x[,col] ~ x[,indcol])
        tmp <- data.frame(data = paste(col,"by",indcol),
                          group = x[1,group],
                          mean.group.1 = tob$estimate[1],
                          mean.group.2 = tob$estimate[2],
                          name.test.stat = tob$statistic,
                          conf.lower = tob$conf.int[1],
                          conf.upper = tob$conf.int[2],
                          pval = tob$p.value,
                          alternative = tob$alternative,
                          tob$method)
        names(tmp)[3:4] <- make.names(names(tob$estimate))
        row.names(tmp) <- x[1,group]
        tmp
    }
    df.l <- split(df[,c(col,indcol,group)],df[,group])
    Reduce(rbind,lapply(df.l,t.test.helper,col=col,indcol=indcol,group=group))}


## example data
examp.data <- data.frame(group=gl(10,100),
                         values=rnorm(1000),
                         t.group=sample(letters[1:2],1000,replace=T))
## example
df.t.test(examp.data,"group","values","t.group")

data group mean.in.group.a mean.in.group.b name.test.stat
1  values by t.group     1      0.06958824      0.02803721      0.2244456
2  values by t.group     2     -0.20944827     -0.06033410     -0.8368429
3  values by t.group     3     -0.20387479      0.07940850     -1.3245172
4  values by t.group     4      0.11406709      0.01975937      0.4220244
5  values by t.group     5      0.09060241     -0.10442099      1.0620544
6  values by t.group     6     -0.05623630     -0.07537593      0.1056388
7  values by t.group     7     -0.26081841     -0.02721652     -0.9533887
8  values by t.group     8     -0.04723535     -0.17205804      0.5662930
9  values by t.group     9      0.08185406      0.01676488      0.3033993
10 values by t.group    10     -0.41406196     -0.02193303     -2.1353113
   conf.lower  conf.upper       pval alternative              tob.method
1  -0.3263444  0.40944644 0.82293023   two.sided Welch Two Sample t-test
2  -0.5030591  0.20483073 0.40487276   two.sided Welch Two Sample t-test
3  -0.7083319  0.14176535 0.18876884   two.sided Welch Two Sample t-test
4  -0.3491642  0.53777963 0.67393371   two.sided Welch Two Sample t-test
5  -0.1693819  0.55942873 0.29082158   two.sided Welch Two Sample t-test
6  -0.3404432  0.37872241 0.91608663   two.sided Welch Two Sample t-test
7  -0.7200269  0.25282312 0.34281018   two.sided Welch Two Sample t-test
8  -0.3126875  0.56233286 0.57251104   two.sided Welch Two Sample t-test
9  -0.3606510  0.49082936 0.76222949   two.sided Welch Two Sample t-test
10 -0.7566487 -0.02760917 0.03527904   two.sided Welch Two Sample t-test