## SIMSEQ: INDUCED COVARIANCE STRUCTURE
# n.group <- 5
# n.genes <- 10
# p <- 500
# n <- 100
# ncom <- 3
simseq.data <- function(n, p, n.group, n.genes, ncom, P){
library(SimSeq)
library(mgcv)
# lists for results
data.list <- list()
score.list <- list()
# pars
de.n <- n.group*n.genes
ee.n <- p-de.n
# INPUT DATA
data(kidney)
counts <- kidney$counts # Matrix of read counts from KIRC dataset
treatment <- kidney$treatment # Treatment vector indicating Non-Tumor or Tumor columns
# mean read count > 10
nonzero <- rowMeans(counts) > 10
counts2 <- counts[nonzero,]
# vahintaan kymmenessa sarakkeessa read count
more2 <- rowSums(counts2 != 0) > 100
nonzero.data <- counts2[more2,]
# transpose
t.nonzero <- t(nonzero.data)
#################################################
# sample randomly 50 genes to be the differentially expressed genes
ind.de <- sort(sample(1:ncol(t.nonzero), de.n))
# make a list of groups
group.l <- split(ind.de, ceiling(seq_along(ind.de)/n.genes))
# replicate data
rep.data <- rbind(t.nonzero, t.nonzero, t.nonzero, t.nonzero, t.nonzero)
# trt group
rep.trt <- c(treatment, treatment, treatment, treatment, treatment)
# sample id
rep.replic <- rep(1:(nrow(rep.data)/2), each = 2)
# matrix of the same size as t.nonzero
# ind.de columns have nonzero values, rest are zeros
add.m <- matrix(0, nrow=nrow(rep.data), ncol = ncol(rep.data))
lambdas <- rep(0, n.group)
for(j in 1:n.group){
for(i in 1:nrow(rep.data)){
add.m[i, group.l[[j]]] <- sample(c(rep.data[,group.l[[j]]]), 1)
}
}
source.data <- rep.data+add.m
source.data2 <- source.data
for(j in 1:n.group){
source.data2[, group.l[[j]]] <- round(source.data[,group.l[[j]]]/2, 0)
}
# all genes
genes <- sort(c(sample((1:ncol(rep.data))[-ind.de], (p-de.n)), ind.de))
diff.ind <- match(sort(ind.de), sort(genes))
# interactions (within each group of DE-genes)
split.diff <- split(diff.ind, ceiling(seq_along(diff.ind)/n.genes))
cons <- lapply(split.diff, FUN=combn, 2 )
t.cons <- lapply(cons, FUN=t)
pair.con <- do.call(rbind, t.cons)
# equally expressed
ee.genes <- genes[-diff.ind]
# true positives
cond.pos <- matrix(0, ncol = p, nrow = p)
cond.pos[pair.con] <- 1
pos.m <- cond.pos
# true negatives
cond.neg <- matrix(1, ncol = p, nrow = p)
cond.neg <- ifelse(upper.tri(cond.neg), 1, 0)
cond.neg <- cond.neg - cond.pos
neg.m <- cond.neg
# normalization factor
nf <- apply(source.data, 1, quantile, 0.75)
for(sim.round in 1:P){
cat("sim = ", iter <- sim.round, "\n")
s <- matrix(0, p, p)
# generate data
data.sim <- SimData(counts = t(source.data2), treatment = rep.trt, sort.method = "paired",
replic = rep.replic,
k.ind = (n/2), genes.select = genes, genes.diff = ind.de,
samp.independent = FALSE, norm.factors = nf)
sim.counts <- data.sim$counts
t.sim.counts <- t(sim.counts)
boxplot(t.sim.counts)
data.list[[sim.round]] <- t.sim.counts
# scale data
data.sc <- scale(t.sim.counts)
# PLS-components
standr <- function(x) x/as.numeric(sqrt(t(x)%*%x))
tot.c <- as.vector(apply(t.sim.counts, 1, sum))
for(i in 1:p){
X <- data.sc[,-i]
y <- data.sc[,i]
c <- matrix(0, p-1, ncom)
TT <- matrix(0, nrow(data.sc), ncom)
c.var <- rep(0, ncom)
SS <- rep(0, ncom)
var.expl <- rep(0, ncom)
tX<- X
for(k in 1:ncom){ # construct ncom PLS components
c[,k] <- standr(t(tX)%*%y)
TT[,k] <- tX%*%c[,k]
tX <- tX-TT[,k]%*%solve(t(TT[,k])%*%TT[,k])%*%t(TT[,k])%*%tX
}
# glm with negative-binomial link
temp.data <- as.data.frame(cbind(t.sim.counts[,i], TT, tot.c))
comp.names <- paste("C", 1:ncom, sep = '')
colnames(temp.data) <- c('y', comp.names, 't.counts')
names <- colnames(temp.data[2:(ncom+1)])
osa1 <- paste("y~")
osa2 <- paste(names, collapse = "+")
form <- paste(osa1, osa2, sep = "")
form2 <- as.formula(form)
mod <- gam(form2, offset=log(t.counts), family=nb(),data=temp.data)
summary(mod)
b <- as.matrix(coef(mod)[-1])
temp.beta <- c%*%b
if (i == 1){
s[i,] <- c(0,temp.beta)
} else {
if (i == p){
s[i,] <- c(temp.beta,0)
} else {
s[i,] <- c(temp.beta[1:(i-1)],0,temp.beta[i:(p-1)])
}
}
}
s <- 0.5*(s+t(s))
s <- s-0.5*(max(s)+min(s))
s <- s/(0.5*(max(s)-min(s)))
diag(s) <- 1
score.list[[sim.round]] <- s
}
return(list(data.list = data.list,
score.list = score.list,
pos.m = pos.m, neg.m = neg.m))
}
data1 <- simseq.data(20, 100, 5, 5, 3, 1000)
data2 <- simseq.data(20, 1000, 5, 15, 3, 1000)
data3 <- simseq.data(100, 100, 5, 5, 3, 1000)
data4 <- simseq.data(100, 1000, 5, 15, 3, 1000)
## FDR
fdr.fun <- function(qvals, score, pos, neg){
# save the results
tp.data <- list()
sens.data <- list()
fp.ind.l <- list()
# condition positive
cond.pos <- pos
# condition negative
cond.neg <- neg
# loop over all selected qvalues
j <- 1
for(qvalue in qvals){
cat("qval = ", iter <- qvalue , "\n")
i <- 1
sens.q <- matrix(0, ncol = 5, nrow = length(score))
tp.q <- matrix(0, ncol = 5, nrow = length(score))
# loop over all generated data sets
for(d in 1:length(score)){
s <- score[[d]]
s.up <- s[upper.tri(s)]
m <- ncol(s)
b <- matrix(0, m, m)
out.density <- density(s.up, adjust = 1)
#out.density <- density(s.up,bw="nrd")
b[upper.tri(b, diag=FALSE)] <- (dnorm(s.up,mean(s.up),sd(s.up))/approx(out.density$x, out.density$y,
s.up, method = "linear")$y)
apu <- matrix(0, m, m)
apu[upper.tri(b, diag=FALSE)] <- 1
apu[which(b > qvalue, arr.ind = TRUE)] <- 0
fdr.m <- which(apu == 1, arr.ind=T)
# test positive
test.pos.fdr <- matrix(0, ncol = m, nrow = m)
test.pos.fdr[fdr.m] <- 1
# test negative
test.neg.fdr <- matrix(1, ncol = m, nrow = m)
test.neg.fdr <- ifelse(upper.tri(test.neg.fdr), 1, 0)
test.neg.fdr[fdr.m] <- 0
# true positives
tp.m.fdr <- cond.pos+test.pos.fdr
tp.fdr <- length(which(tp.m.fdr == 2))
# false positives
fp.m.fdr <- cond.neg + test.pos.fdr
fp.fdr <- length(which(fp.m.fdr == 2))
fp.ind <- which(fp.m.fdr ==2, arr.ind = TRUE)
fp.ind.l[[d]] <- fp.ind
# true negatives
tn.m.fdr <- cond.neg + test.neg.fdr
tn.fdr <- length(which(tn.m.fdr == 2))
# false negatives
fn.m.fdr <- cond.pos + test.neg.fdr
fn.fdr <- length(which(fn.m.fdr == 2))
# sensitivity
if(tp.fdr+fn.fdr == 0){
sens.fdr <- 0
} else {
sens.fdr <- tp.fdr/(tp.fdr+fn.fdr)
}
# specificity
if(fp.fdr+tn.fdr == 0){
spec.fdr <- 0
} else {
spec.fdr <- tn.fdr/(fp.fdr+tn.fdr)
}
# true discovery rate
if(tp.fdr+fp.fdr == 0){
tdr.fdr <- 0
} else {
tdr.fdr <- tp.fdr/(tp.fdr+fp.fdr)
}
# true non-discovery rate
if(tn.fdr+fn.fdr == 0){
tnr.fdr <- 0
} else {
tnr.fdr <- tn.fdr/(tn.fdr+fn.fdr)
}
# save the results into ith row of a matrix
tp.q[i,1] <- qvalue
tp.q[i,2] <- tp.fdr
tp.q[i,3] <- fp.fdr
tp.q[i,4] <- tn.fdr
tp.q[i,5] <- fn.fdr
sens.q[i,1] <- qvalue
sens.q[i,2] <- sens.fdr
sens.q[i,3] <- spec.fdr
sens.q[i,4] <- tdr.fdr
sens.q[i,5] <- tnr.fdr
i <- i+1
}
tp.data[[j]] <- tp.q
sens.data[[j]] <- sens.q
j <- j+1
}
return(list(tp.list = tp.data, sens.list = sens.data, fp.ind.l = fp.ind.l ))
}
## CALL FUNCTION
qvals <- seq(0, 1, by = 0.05)
FDR1 <- fdr.fun(qvals, data1$score.list, data1$pos.m, data1$neg.m)
FDR2 <- fdr.fun(qvals, data2$score.list, data2$pos.m, data2$neg.m)
FDR3 <- fdr.fun(qvals, data3$score.list, data3$pos.m, data3$neg.m)
FDR4 <- fdr.fun(qvals, data4$score.list, data4$pos.m, data4$neg.m)
means1 <- lapply(FDR1$sens.list, colMeans)
means2 <- lapply(FDR2$sens.list, colMeans)
means3 <- lapply(FDR3$sens.list, colMeans)
means4 <- lapply(FDR4$sens.list, colMeans)