sim.nb.data <- function(m, l, n, ncom, P, prob.bin, odp){
library(mgcv)
library(SimSeq)
# lists for results
data.list <- list()
score.list <- list()
# RNA-Seq -data to compute mean read counts (to be sampled later)
data(kidney)
sample.data <- t(kidney$counts)
means <- apply(sample.data, 2, mean)
# latent variables from normal distribution
md<-as.data.frame(matrix(rnorm(l*n),nrow=n,ncol=l))
# boxplot(md)
ci <- list()
ci <- replicate(m, sample(1:l, rbinom(1,l,prob.bin),replace = FALSE))
# design matrix to define the relationships between latent variables and new genes
D <- matrix(0, nrow = m, ncol = l)
for(i in 1:m){
D[i, ci[[i]]] <- 1
}
A <- D%*%t(D)
# connections
d.up <- ifelse(upper.tri(A), A, 0)
con <- which(d.up!=0,arr.ind = T)
con.m <- con
# true positives
cond.pos <- matrix(0, ncol = m, nrow = m)
cond.pos <- ifelse(d.up>0, 1, 0)
pos.m <- cond.pos
# true negatives
cond.neg <- matrix(1, ncol = m, nrow = m)
cond.neg <- ifelse(upper.tri(cond.neg), 1, 0)
cond.neg <- cond.neg - cond.pos
neg.m <- cond.neg
for(p in 1:P){
cat("sim = ", iter <- p, "\n")
s <- matrix(0, m, m)
s.w <- matrix(0, m, m)
# empty matrix
data.sim <- matrix(NA, nrow = n, ncol = m)
x.new <- rep(NA, n)
xbeta <- rep(0, l)
# overdispersion
epsilon <- rgamma(n, shape = odp, scale = 1/odp)
for(k in 1:m){
beta.lp <- rep(0, l)
sign <- rep(1, length(ci[[k]]))
beta.lp[sort(ci[[k]])] <- sign
# sample expected values
# low counts not accepted (problems when fitting nb model)
mean.rc <- 0
while(mean.rc < 10){
mean.rc <- sample(means, 1)
}
mean.rc
for(j in 1:n){
# beta*x
for(g in 1:l){
xbeta[g] <- beta.lp[g]*md[j,g]
}
linpred <- sum(xbeta)
lambda <- epsilon[j]*exp(log(mean.rc)+linpred)
x.new[j] <- rpois(1,lambda)
}
data.sim[,k] <- x.new
}
data.list[[p]] <- data.sim
# scale data
data.sc <- scale(data.sim)
# PLS-components
standr <- function(x) x/as.numeric(sqrt(t(x)%*%x))
tot.c <- as.vector(apply(data.sim, 1, sum))
for(i in 1:m){
X <- data.sc[,-i]
y <- data.sc[,i]
c <- matrix(0, m-1, ncom)
TT <- matrix(0, nrow(data.sc), ncom)
c.var <- rep(0, ncom)
SS <- rep(0, ncom)
var.expl <- rep(0, ncom)
tX<- X
for(k in 1:ncom){ # construct ncom PLS components
c[,k] <- standr(t(tX)%*%y)
TT[,k] <- tX%*%c[,k]
c.var[k] <- (t(y)%*%TT[,k])/(t(TT[,k])%*%TT[,k])
SS[k] <- (t(TT[,k])%*%TT[,k])%*%(t(c.var[k])%*%c.var[k])
var.expl[k] <- SS[k] / sum(diag(t(y)%*%y))
tX <- tX-TT[,k]%*%solve(t(TT[,k])%*%TT[,k])%*%t(TT[,k])%*%tX
}
w <- var.expl / sum(var.expl) # weights
# glm with negative-binomial link
temp.data <- as.data.frame(cbind(data.sim[,i], TT, tot.c))
comp.names <- paste("C", 1:ncom, sep = '')
colnames(temp.data) <- c('y', comp.names, 't.counts')
names <- colnames(temp.data[2:(ncom+1)])
osa1 <- paste("y~")
osa2 <- paste(names, collapse = "+")
form <- paste(osa1, osa2, sep = "")
form2 <- as.formula(form)
mod <- gam(form2, offset=log(t.counts), family=nb(),data=temp.data)
summary(mod)
b <- as.matrix(coef(mod)[-1])
temp.beta <- c%*%b
if (i == 1){
s[i,] <- c(0,temp.beta)
} else {
if (i == m){
s[i,] <- c(temp.beta,0)
} else {
s[i,] <- c(temp.beta[1:(i-1)],0,temp.beta[i:(m-1)])
}
}
}
s <- 0.5*(s+t(s))
s <- s-0.5*(max(s)+min(s))
s <- s/(0.5*(max(s)-min(s)))
diag(s) <- 1
score.list[[p]] <- s
}
return(list(data.list = data.list, score.list = score.list,
pos.m = pos.m, neg.m = neg.m, con.m = con ))
}
# CALL FUNCTION
data1 <- sim.nb.data(100, 10, 20, 3, 1000, 0.05, 10)
data2 <- sim.nb.data(1000, 10, 20, 3, 1000, 0.05, 10)
data3 <- sim.nb.data(100, 10, 100, 3, 1000, 0.05, 10)
data4 <- sim.nb.data(1000, 10, 100, 3, 1000, 0.05, 10)
###################################################
# FUNCTION COMPUTING FDR WITH DIFFERENT LEVELS OF QVALUE
fdr.fun <- function(qvals, score, pos, neg){
# save the results
tp.data <- list()
sens.data <- list()
# condition positive
cond.pos <- pos
# condition negative
cond.neg <- neg
# loop over all selected qvalues
j <- 1
for(qvalue in qvals){
cat("qval = ", iter <- qvalue , "\n")
i <- 1
sens.q <- matrix(0, ncol = 5, nrow = length(score))
tp.q <- matrix(0, ncol = 5, nrow = length(score))
# loop over all generated data sets
for(d in 1:length(score)){
s <- score[[d]]
s.up <- s[upper.tri(s)]
m <- ncol(s)
b <- matrix(0, m, m)
out.density <- density(s.up, adjust = 1)
b[upper.tri(b, diag=FALSE)] <- (dnorm(s.up,mean(s.up),sd(s.up))/approx(out.density$x, out.density$y,
s.up, method = "linear")$y)
apu <- matrix(0, m, m)
apu[upper.tri(b, diag=FALSE)] <- 1
apu[which(b > qvalue, arr.ind = TRUE)] <- 0
fdr.m <- which(apu == 1, arr.ind=T)
# test positive
test.pos.fdr <- matrix(0, ncol = m, nrow = m)
test.pos.fdr[fdr.m] <- 1
# test negative
test.neg.fdr <- matrix(1, ncol = m, nrow = m)
test.neg.fdr <- ifelse(upper.tri(test.neg.fdr), 1, 0)
test.neg.fdr[fdr.m] <- 0
# true positives
tp.m.fdr <- cond.pos+test.pos.fdr
tp.fdr <- length(which(tp.m.fdr == 2))
# false positives
fp.m.fdr <- cond.neg + test.pos.fdr
fp.fdr <- length(which(fp.m.fdr == 2))
# true negatives
tn.m.fdr <- cond.neg + test.neg.fdr
tn.fdr <- length(which(tn.m.fdr == 2))
# false negatives
fn.m.fdr <- cond.pos + test.neg.fdr
fn.fdr <- length(which(fn.m.fdr == 2))
# sensitivity
if(tp.fdr+fn.fdr == 0){
sens.fdr <- 0
} else {
sens.fdr <- tp.fdr/(tp.fdr+fn.fdr)
}
# specificity
if(fp.fdr+tn.fdr == 0){
spec.fdr <- 0
} else {
spec.fdr <- tn.fdr/(fp.fdr+tn.fdr)
}
# true discovery rate
if(tp.fdr+fp.fdr == 0){
tdr.fdr <- 0
} else {
tdr.fdr <- tp.fdr/(tp.fdr+fp.fdr)
}
# true non-discovery rate
if(tn.fdr+fn.fdr == 0){
tnr.fdr <- 0
} else {
tnr.fdr <- tn.fdr/(tn.fdr+fn.fdr)
}
# save the results into ith row of a matrix
tp.q[i,1] <- qvalue
tp.q[i,2] <- tp.fdr
tp.q[i,3] <- fp.fdr
tp.q[i,4] <- tn.fdr
tp.q[i,5] <- fn.fdr
sens.q[i,1] <- qvalue
sens.q[i,2] <- sens.fdr
sens.q[i,3] <- spec.fdr
sens.q[i,4] <- tdr.fdr
sens.q[i,5] <- tnr.fdr
i <- i+1
}
tp.data[[j]] <- tp.q
sens.data[[j]] <- sens.q
j <- j+1
}
return(list(tp.list = tp.data, sens.list = sens.data ))
}
## CALL FUNCTION
qvals <- seq(0, 1, by = 0.05)
FDR1 <- fdr.fun(qvals, data1$score.list, data1$pos.m, data1$neg.m)
FDR2 <- fdr.fun(qvals, data2$score.list, data2$pos.m, data2$neg.m)
FDR3 <- fdr.fun(qvals, data3$score.list, data3$pos.m, data3$neg.m)
FDR4 <- fdr.fun(qvals, data4$score.list, data4$pos.m, data4$neg.m)
means1 <- lapply(FDR1$sens.list, colMeans)
means2fdr <- lapply(FDR2$sens.list, colMeans)
means3 <- lapply(FDR3$sens.list, colMeans)
means4 <- lapply(FDR4$sens.list, colMeans)
# corrected SEs, qvalue = 0.2
# # list with qval = 0.2
# P <- 100
# temp1 <- wFDR4$sens.list[[4]]
# temp.mean <- mean(temp1[,2])
# temp.mean
# var.sens <- var(temp1[,2])
# se.sens <- sqrt(1/P*(var.sens))
# round(se.sens, 3)
# var.spec <- var(temp1[,4])
# se.spec <- sqrt(1/P*(var.spec))
# round(se.spec, 3)
# var.tdr <- var(temp1[,6])
# se.tdr <- sqrt(1/P*(var.tdr))
# round(se.tdr, 3)
# var.tnr <- var(temp1[,8])
# se.tnr <- sqrt(1/P*(var.tnr))
# round(se.tnr, 3)