Report

Ali Oghabian

2019-04-25

The topics covered in this document are:

Files in the zip

The files in the zip/folder include:

R and IntEREst version

Please note the R version and the IntEREst version used for these analysis:

R.Version()
## $platform
## [1] "x86_64-pc-linux-gnu"
## 
## $arch
## [1] "x86_64"
## 
## $os
## [1] "linux-gnu"
## 
## $system
## [1] "x86_64, linux-gnu"
## 
## $status
## [1] "Under development (unstable)"
## 
## $major
## [1] "3"
## 
## $minor
## [1] "6.0"
## 
## $year
## [1] "2018"
## 
## $month
## [1] "05"
## 
## $day
## [1] "02"
## 
## $`svn rev`
## [1] "74682"
## 
## $language
## [1] "R"
## 
## $version.string
## [1] "R Under development (unstable) (2018-05-02 r74682)"
## 
## $nickname
## [1] "Unsuffered Consequences"
packageVersion("IntEREst")
## [1] '1.7.5'


Annotating u12 type introns of HG38

Initially, a reference data frame was produced from ENSEMBL HG38 (GRCh38.p12). Note that in order to be able to run the R scripts in the document you should set the R working directory to the folder that contains the extrcated files form the zip file.


# Time demanding
ens38Uncol<- referencePrepare (sourceBuild="biomaRt", 
    biomart="ENSEMBL_MART_ENSEMBL",
    biomartDataset="hsapiens_gene_ensembl",
    biomartHost="www.ensembl.org", 
    ignore.strand=FALSE, 
    annotateGeneIds=TRUE, 
    collapseExons=FALSE)


Next the U12-type introns were annotated using donor socre threoshold 0.07 and branch point score threshold 0.14. If an intron was not annotated as a U12-type or a U2-type we considered it as U2-type (i.e. the default parameter setting setNaAs= "U2" was used).

# Time demanding
library(BSgenome.Hsapiens.UCSC.hg38)
# Building PWM
pwmPaste<-buildSsTypePwms(u12DonorBegin=13, u2DonorBegin=11, 
    u2AcceptorBegin=38, u12DonorEnd=17, u2DonorEnd=17, 
    u2AcceptorEnd=40, pasteSites=TRUE)
save(pwmPaste, file="./pwmPaste.rda")

# Ignore 3' end nucleotide
# Annotations will be based on Donor and BP scores
accPasteScore=as.matrix(pwmPaste[[3]])
for(i in 1:nrow(accPasteScore))accPasteScore[i,1]=0

# Modify the chr names of the reference data.frame

if(length(grep("^chr",ens38Uncol$chr))==0)
    ens38Uncol$chr<- paste("chr", ens38Uncol$chr, sep="")

accPasteScore=as.matrix(pwmPaste[[3]])
for(i in 1:nrow(accPasteScore))accPasteScore[i,1]=0

ens38UncolMod<- ens38Uncol[which(ens38Uncol$chr %in% 
    paste("chr", c(1:22,"X","Y", "MT"), sep="")),]
ens38UncolMod[which(ens38UncolMod$chr == "chrMT"),]<- "chrM"
save(ens38UncolMod, file="./ens38UncolMod.rda")

# Annotate U12-type introns using donor and BP of U12- and Donor and acceptor 
# sites of U2-type introns.

ensHg38UncolFilpaste60AnnoMat_40_don07_bp14<- annotateU12(
    pwmU12U2= list(
        pwmPaste[[1]],
        pwmPaste[[2]],
        accPasteScore,
        pwmPaste[[4]],
        pwmPaste[[5]]),
    pwmSsIndex= list(
        indexDonU12=1, 
        indexBpU12=1, 
        indexAccU12=1, 
        indexDonU2=1, 
        indexAccU2=1), 
    referenceChr= ens38UncolMod[,"chr"], 
    referenceBegin= as.numeric(ens38UncolMod[,"begin"]), 
    referenceEnd= as.numeric(ens38UncolMod[,"end"]), 
    referenceIntronExon= ens38UncolMod[,"int_ex"],
    intronExon= "intron",
    matchWindowRelativeUpstreamPos= c(2,-40,NA,NA,NA),
    matchWindowRelativeDownstreamPos= c(6,-3,NA,NA,NA), 
    minMatchScore= c( .07, .14, 0, .07,  0), 
    refGenome= BSgenome.Hsapiens.UCSC.hg38, 
    setNaAs= "U2", 
    annotateU12Subtype= TRUE,
    includeMatchScores= TRUE)

save(ensHg38UncolFilpaste60AnnoMat_40_don07_bp14, 
    file="./ensHg38UncolFilpaste60AnnoMat_40_don07_bp14.rda")


The number of the detected U12-type introns are as follows:

load(file="./ensHg38UncolFilpaste60AnnoMat_40_don07_bp14.rda")

# Number of U12-type intron
table(ensHg38UncolFilpaste60AnnoMat_40_don07_bp14$int_type)
## 
##     U12      U2 
##    3528 1051845
# Number of unique U12-type intron
indU12<- which(ensHg38UncolFilpaste60AnnoMat_40_don07_bp14$int_type=="U12")
length(
    unique(paste(ens38UncolMod[indU12,"chr"], 
        ens38UncolMod[indU12,"begin"], ens38UncolMod[indU12,"end"], sep="_")))
## [1] 826
# Number of U12-type intron subtypes
table(ensHg38UncolFilpaste60AnnoMat_40_don07_bp14$u12_subtype)
## 
## AT-AC GT-AG 
##   903  2625
# Number of unique U12-type intron subtypes
uniCoord<- unique(paste(ens38UncolMod[indU12,"chr"], 
        ens38UncolMod[indU12,"begin"], ens38UncolMod[indU12,"end"], sep="_"))
coord<- paste(ens38UncolMod[indU12,"chr"], 
        ens38UncolMod[indU12,"begin"], ens38UncolMod[indU12,"end"], sep="_")
indUni<-match(uniCoord, coord)
table(ensHg38UncolFilpaste60AnnoMat_40_don07_bp14$u12_subtype[indU12[indUni]])
## 
## AT-AC GT-AG 
##   195   631


ncRNAs with U12-type introns

To detect the ncRNAs, initially various annotations of ENSEMBL genes, e.g. extrenal gene name and biotype (description), were downloaded using Using biomaRt.

ensembl<- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", 
    host="www.ensembl.org", dataset="hsapiens_gene_ensembl")

annoFil<- biomaRt::getBM( attributes=c("ensembl_gene_id", 
        "ensembl_transcript_id", "external_gene_name", "gene_biotype", 
        "transcript_biotype"), 
    filters = "ensembl_transcript_id", 
    values = unique(ens38UncolMod$transcript_id), 
    mart=ensembl, uniqueRows = TRUE)

save(annoFil, file="./annoFil.rda")


indPick<- which((transcript_biotype=="lincRNA" | 
    transcript_biotype=="bidirectional_promoter_lncRNA") & 
    ensHg38UncolFilpaste60AnnoMat_40_don07_bp14$int_type=="U12")
U12Linc<-ens38UncolMod[indPick,]

donPos=Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38, 
                    names=U12Linc[U12Linc[,4]=="+",1],
                    start=
                        as.numeric(U12Linc[U12Linc[,4]=="+",2]),
                    end=
                        as.numeric(U12Linc[U12Linc[,4]=="+",2])+9,
                    as.character=TRUE)
accNeg=Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38, 
                    names=U12Linc[U12Linc[,4]=="-",1],
                    start=
                        as.numeric(U12Linc[U12Linc[,4]=="-",2]),
                    end=
                        as.numeric(U12Linc[U12Linc[,4]=="-",2])+39,
                    as.character=TRUE)

accPos=Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38, 
                    names=U12Linc[U12Linc[,4]=="+",1],
                    start=as.numeric(U12Linc[U12Linc[,4]=="+",3])-39,
                    end=as.numeric(U12Linc[U12Linc[,4]=="+",3])
                        ,
                    as.character=TRUE)
donNeg=Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38, 
                    names=U12Linc[U12Linc[,4]=="-",1],
                    start=as.numeric(U12Linc[U12Linc[,4]=="-",3])-9,
                    end=as.numeric(U12Linc[U12Linc[,4]=="-",3]),
                    as.character=TRUE)

donNeg<- as.vector(
    as.character(Biostrings::reverseComplement(DNAStringSet(donNeg))))
accNeg<- as.vector(
    as.character(Biostrings::reverseComplement(DNAStringSet(accNeg))))
accPos<- as.vector(accPos)
donPos<- as.vector(donPos)

donorSeq<- rep("",nrow(U12Linc))
acceptorSeq<- rep("",nrow(U12Linc))
donorSeq[U12Linc[,4]=="+"]<-donPos
donorSeq[U12Linc[,4]=="-"]<-donNeg
acceptorSeq[U12Linc[,4]=="+"]<-accPos
acceptorSeq[U12Linc[,4]=="-"]<-accNeg
U12LincMat<- cbind(U12Linc, donorSeq, acceptorSeq)

# Write lincRNA U12-type introns 
write.table(U12LincMat, file="./U12LncRNA.tsv", 
    col.names=T, row.names=F, sep='\t', quote=F)
# write data frame with info for all introns
write.table(
    cbind(ens38UncolMod, external_gene_name, 
        ensHg38UncolFilpaste60AnnoMat_40_don07_bp14, 
        transcript_biotype),
    file="./allIntronsU12annotationGeneType.tsv",
    col.names=T, row.names=F, sep='\t', quote=F)

# Build datat frame with U12 type introns info
u12Out<- 
    cbind(ens38UncolMod, ensHg38UncolFilpaste60AnnoMat_40_don07_bp14,
            external_gene_name, 
            transcript_biotype)[which(
                ensHg38UncolFilpaste60AnnoMat_40_don07_bp14$int_type=="U12"),]

donPos=Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38, 
                    names=u12Out[u12Out[,4]=="+",1],
                    start=
                        as.numeric(u12Out[u12Out[,4]=="+",2]),
                    end=
                        as.numeric(u12Out[u12Out[,4]=="+",2])+9,
                    as.character=TRUE)
accNeg=Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38, 
                    names=u12Out[u12Out[,4]=="-",1],
                    start=
                        as.numeric(u12Out[u12Out[,4]=="-",2]),
                    end=
                        as.numeric(u12Out[u12Out[,4]=="-",2])+39,
                    as.character=TRUE)

accPos=Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38, 
                    names=u12Out[u12Out[,4]=="+",1],
                    start=as.numeric(u12Out[u12Out[,4]=="+",3])-39,
                    end=as.numeric(u12Out[u12Out[,4]=="+",3])
                        ,
                    as.character=TRUE)
donNeg=Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38, 
                    names=u12Out[u12Out[,4]=="-",1],
                    start=as.numeric(u12Out[u12Out[,4]=="-",3])-9,
                    end=as.numeric(u12Out[u12Out[,4]=="-",3]),
                    as.character=TRUE)

donNeg<- 
as.vector(as.character(Biostrings::reverseComplement(DNAStringSet(donNeg))))
accNeg<- 
as.vector(as.character(Biostrings::reverseComplement(DNAStringSet(accNeg))))
accPos<- as.vector(accPos)
donPos<- as.vector(donPos)

donorSeq<- rep("",nrow(u12Out))
acceptorSeq<- rep("",nrow(u12Out))
donorSeq[u12Out[,4]=="+"]<-donPos
donorSeq[u12Out[,4]=="-"]<-donNeg
acceptorSeq[u12Out[,4]=="+"]<-accPos
acceptorSeq[u12Out[,4]=="-"]<-accNeg
u12OutSeq<- cbind(u12Out, donorSeq, acceptorSeq)

write.table(
    u12OutSeq,
    file="./AllU12Introns.tsv",
    col.names=T, row.names=F, sep='\t', quote=F)
write.table(
    u12OutSeq[which(u12OutSeq$transcript_biotype!="protein_coding"),],
    file="./nonProtCodingU12Introns.tsv",
    col.names=T, row.names=F, sep='\t', quote=F)


U12 annotation comparison

Changes in the parameter settings of the annotateU12() function may lead to detection of different sets of U12-type introns. Here, we compare the U12-type introns detected from the same annotateU12() run as mentioned in this report (on a reference built from HG19) to those reported by Merico, D. et al.


Detected U12 type introns for HG19 compared to those detected by merico, D. *et al*.

Detected U12 type introns for HG19 compared to those detected by merico, D. et al.


Finally

The following scripts were used in R to generate the suppl1.html file from the suppl1.Rmd file.

library("rmarkdown")
render("./report.Rmd")


Reference