Sunday, July 7, 2013

R - annotation of a microarray platform

along with chapter 7 of Draghici: Statistics and Data Analysis for Microarrays Using R and Bioconductor

  • use the annotation() accessor to get the type of microarray used

annotation(Golub_Merge)

  • so we use the hu6800.db package, which provides detailed information about the hu6800 platform. The package is updated biannually.
  • install the package via biocLite("hu6800.db")
  • if you want to see what objects this package supports type

ls("package:hu6800.db")

[1] "hu6800"              "hu6800ACCNUM"        "hu6800ALIAS2PROBE"  
 [4] "hu6800CHR"           "hu6800CHRLENGTHS"    "hu6800CHRLOC"       
 [7] "hu6800CHRLOCEND"     "hu6800.db"           "hu6800_dbconn"      
[10] "hu6800_dbfile"       "hu6800_dbInfo"       "hu6800_dbschema"    
[13] "hu6800ENSEMBL"       "hu6800ENSEMBL2PROBE" "hu6800ENTREZID"     
[16] "hu6800ENZYME"        "hu6800ENZYME2PROBE"  "hu6800GENENAME"     
[19] "hu6800GO"            "hu6800GO2ALLPROBES"  "hu6800GO2PROBE"     
[22] "hu6800MAP"           "hu6800MAPCOUNTS"     "hu6800OMIM"         
[25] "hu6800ORGANISM"      "hu6800ORGPKG"        "hu6800PATH"         
[28] "hu6800PATH2PROBE"    "hu6800PFAM"          "hu6800PMID"         
[31] "hu6800PMID2PROBE"    "hu6800PROSITE"       "hu6800REFSEQ"       
[34] "hu6800SYMBOL"        "hu6800UNIGENE"       "hu6800UNIPROT"

  • to get the mappings between manufacturer identifiers and gene abbreviations you can use hu6800SYMBOL and mget()
mget(ls(hu6800SYMBOL)[1:5],hu6800SYMBOL )
$A28102_at
[1] "GABRA3"

$AB000114_at
[1] "OMD"

$AB000115_at
[1] "IFI44L"

$AB000220_at
[1] "SEMA3C"

$AB000381_s_at
[1] "GML"
  • you can also use the DBI package infrastructure
    • list the tables available for NCBI Entrez Gene metadata

require(DBI)
hh <- org.Hs.eg_dbconn()
dbListTables(hh)
[1] "accessions"            "alias"                 "chrlengths"           
 [4] "chromosome_locations"  "chromosomes"           "cytogenetic_locations"
 [7] "ec"                    "ensembl"               "ensembl2ncbi"         
[10] "ensembl_prot"          "ensembl_trans"         "gene_info"            
[13] "genes"                 "go"                    "go_all"               
[16] "go_bp"                 "go_bp_all"             "go_cc"                
[19] "go_cc_all"             "go_mf"                 "go_mf_all"            
[22] "kegg"                  "map_counts"            "map_metadata"         
[25] "metadata"              "ncbi2ensembl"          "omim"                 
[28] "pfam"                  "prosite"               "pubmed"               
[31] "refseq"                "sqlite_stat1"          "ucsc"                 
[34] "unigene"               "uniprot"
  • query for genes annotated as sarcoma oncogenes
SQL <- "select * from gene_info gi, genes g where gi._id=g._id and gene_name like '%arcoma%oncogene%'"
sops <- dbGetQuery(hh,SQL)
head(sops)
_id                                                     gene_name symbol
1  314              v-raf murine sarcoma 3611 viral oncogene homolog   ARAF
2  559                v-raf murine sarcoma viral oncogene homolog B1   BRAF
3 1156             v-crk sarcoma virus CT10 oncogene homolog (avian)    CRK
4 1157        v-crk sarcoma virus CT10 oncogene homolog (avian)-like   CRKL
5 1834                                       feline sarcoma oncogene    FES
6 1859 Gardner-Rasheed feline sarcoma viral (v-fgr) oncogene homolog    FGR
   _id gene_id
1  314     369
2  559     673
3 1156    1398
4 1157    1399
5 1834    2242
6 1859    2268
  • find the LYN gene
sops[grep("LYN",sops$symbol),]
  • so we see it's in the 13th row and the gene id is 4067
sops[grep("LYN",sops$symbol),]
  • the we can use get() to get the probe set identifier
get("4067",revmap(hu6800ENTREZID))
[1] "M16038_at"
  • display the expression data of the gene by disease group
boxplot(exprs(Golub_Merge)["M16038_at",]~Golub_Merge$ALL.AML,col=c("darkgreen","gold"),notch=T)

  • and the ggplot2 version
require(ggplot2)
gmdata <- data.frame(expr=exprs(Golub_Merge)["M16038_at",],ALL.AML=Golub_Merge$ALL.AML)
ggplot(gmdata,aes(x=ALL.AML,y=expr,fill=ALL.AML)) +
    geom_boxplot(notch=T) +
    scale_fill_manual(values=c("darkgreen","gold")) +
    xlab("")
ggsave("ggboxpl.png")

No comments :

Post a Comment