# PARAMETERS
# g = number of genes
# m = number of important genes
# n = number of samples
# rho = correlation
# ncom = number of PLS-components
# P = number of data sets to be generated
# g = 100
# m = 10
# n = 20
# rho = 0.9
# ncom = 3
# P = 2
sim.multin.data <- function(g, m, n, rho, ncom, P){
library(MASS)
library(mgcv)
library(SimSeq)
# lists for results
data.list <- list()
score.list <- list()
## PARAMETRIT MULTINORMAALIJAKAUMALLE
# covariance matrix
# only positive covariance elements
m1 <- diag(m)
for(i in 1:m){
for(j in 1:(i-1)){
if(i == j){
m1[i,j] <- 1
} else{
m1[i,j] <- rho
}
}
}
m2 <- t(m1) + m1
m3 <- diag(2, g)
m3[1:m, 1:m] <- m2
cov.m <- m3 - diag(g)
## EXPECTED VALUES
mu.v <- rep(0, g)
# CONDITION POSITIVE
cond.pos <- matrix(0, ncol = g, nrow = g)
cov.up <- ifelse(upper.tri(cov.m), cov.m, 0)
cond.pos[which(abs(cov.up) == rho, arr.ind = TRUE)] <- 1
pos.m <- cond.pos
# CONDITION NEGATIVE
cond.neg <- matrix(1, ncol = g, nrow = g)
cond.neg <- ifelse(upper.tri(cond.neg), 1, 0)
cond.neg <- cond.neg - cond.pos
neg.m <- cond.neg
# NUMBER OF CONNECTIONS
con <- m*(m-1)*0.5
## DATA for sampling total counts
data(kidney)
kidney.data <- t(kidney$counts)
## GENERATE TOTAL READ COUNT C FOR SUBJECT j FROM A DISTRIBUTION THAT IS REALISTIC
t.counts <- c()
for(j in 1:nrow(kidney.data)){
t.counts[j] <- sum(sample(kidney.data[j,], g, replace = TRUE))
}
for(p in 1:P){
cat("sim = ", iter <- p, "\n")
s <- matrix(0, g, g)
data.sim <- matrix(NA, nrow = n, ncol = g)
## GENERATE DATA
## MULTIVARIATE NORMAL DATA
mvn.m <- mvrnorm(n, mu.v, cov.m)/5
## EXP(X_i)
exp.mvn.m <- exp(mvn.m)
## GENERATE A VECTOR OF READ COUNTS FOR EACH SAMPLE FROM MULTINOMIAL DISTRIBUTION WITH PARAMETERS
## C (NUMBER OF TRIALS) AND SUCCESS PROPABILITIES lambda_j / lambda_tot
# lambda_tot = sum of lambda_i = sum of exp(x_i)
lambda.tot <- apply(exp.mvn.m, 1, sum)
# loop over subjects i, i = 1, ..., n
count.m <- matrix(0, nrow = n, ncol = g)
for(j in 1:n){
# resample C from t.counts
C <- sample(t.counts, 1, replace = TRUE) # t.counts are computed from random p genes
# succes probabilities for each gene for subject i and counts from multin. dist.
prob <- c()
# one sample (here are the \lambda_1 - \lambda_G)
sample <- exp.mvn.m[j,]
# probabilities: \lambda_1 / \lambda_tot from sample j
prob <- sample / lambda.tot[j]
count.m[j,] <- t(rmultinom(1, C, prob))
}
data.sim <- count.m
data.list[[p]] <- data.sim
# scale data
data.sc <- scale(data.sim)
# PLS-components
standr <- function(x) x/as.numeric(sqrt(t(x)%*%x))
tot.c <- as.vector(apply(data.sim, 1, sum))
for(i in 1:g){
X <- data.sc[,-i]
y <- data.sc[,i]
c <- matrix(0, g-1, ncom)
TT <- matrix(0, nrow(data.sc), ncom)
c.var <- rep(0, ncom)
SS <- rep(0, ncom)
var.expl <- rep(0, ncom)
tX<- X
for(k in 1:ncom){ # construct ncom PLS components
c[,k] <- standr(t(tX)%*%y)
TT[,k] <- tX%*%c[,k]
c.var[k] <- (t(y)%*%TT[,k])/(t(TT[,k])%*%TT[,k])
SS[k] <- (t(TT[,k])%*%TT[,k])%*%(t(c.var[k])%*%c.var[k])
var.expl[k] <- SS[k] / sum(diag(t(y)%*%y))
tX <- tX-TT[,k]%*%solve(t(TT[,k])%*%TT[,k])%*%t(TT[,k])%*%tX
}
w <- var.expl / sum(var.expl) # weights
# glm with negative-binomial link
temp.data <- as.data.frame(cbind(data.sim[,i], TT, tot.c))
comp.names <- paste("C", 1:ncom, sep = '')
colnames(temp.data) <- c('y', comp.names, 't.counts')
names <- colnames(temp.data[2:(ncom+1)])
osa1 <- paste("y~")
osa2 <- paste(names, collapse = "+")
form <- paste(osa1, osa2, sep = "")
form2 <- as.formula(form)
mod <- gam(form2, offset=log(t.counts), family=nb(),data=temp.data)
summary(mod)
b <- as.matrix(coef(mod)[-1])
temp.beta <- c%*%b
if (i == 1){
s[i,] <- c(0,temp.beta)
} else {
if (i == g){
s[i,] <- c(temp.beta,0)
} else {
s[i,] <- c(temp.beta[1:(i-1)],0,temp.beta[i:(g-1)])
}
}
}
# lopullinen symmetrisoitu ja skaalattu (max score = 1, min = -1) assosiaatiomatriisi
s <- 0.5*(s+t(s))
s <- s-0.5*(max(s)+min(s))
s <- s/(0.5*(max(s)-min(s)))
diag(s) <- 1
score.list[[p]] <- s
}
return(list(data.list = data.list, score.list = score.list,
pos.m = pos.m, neg.m = neg.m, con.m = con ))
}
# CALL FUNCTION
data1 <- sim.multin.data(100, 3, 20, 0.9, 3, 1000)
data2 <- sim.multin.data(1000, 10, 20, 0.9, 3, 100)
data3 <- sim.multin.data(100, 3, 100, 0.9, 3, 1000)
data4 <- sim.multin.data(1000, 10, 100, 0.9, 3, 100)
###################################################
# FUNCTION COMPUTING FDR WITH DIFFERENT LEVELS OF QVALUE
fdr.fun <- function(qvals, score, pos, neg){
# save the results
tp.data <- list()
sens.data <- list()
# condition positive
cond.pos <- pos
# condition negative
cond.neg <- neg
# loop over all selected qvalues
j <- 1
for(qvalue in qvals){
cat("qval = ", iter <- qvalue , "\n")
i <- 1
tp.q <- matrix(0, ncol = 5, nrow = length(score))
sens.q <- matrix(0, ncol = 5, nrow = length(score))
# loop over all generated data sets
for(d in 1:length(score)){
s <- score[[d]]
s.up <- s[upper.tri(s)]
m <- ncol(s)
b <- matrix(0, m, m)
out.density <- density(s.up, adjust = 2, from = -1, to = 1)
b[upper.tri(b, diag=FALSE)] <- (dnorm(s.up,mean(s.up),sd(s.up))/approx(out.density$x, out.density$y,
s.up, method = "linear")$y)
apu <- matrix(0, m, m)
apu[upper.tri(b, diag=FALSE)] <- 1
apu[which(b > qvalue, arr.ind = TRUE)] <- 0
fdr.m <- which(apu == 1, arr.ind=T)
# test positive
test.pos.fdr <- matrix(0, ncol = m, nrow = m)
test.pos.fdr[fdr.m] <- 1
# test negative
test.neg.fdr <- matrix(1, ncol = m, nrow = m)
test.neg.fdr <- ifelse(upper.tri(test.neg.fdr), 1, 0)
test.neg.fdr[fdr.m] <- 0
# true positives
tp.m.fdr <- cond.pos+test.pos.fdr
tp.fdr <- length(which(tp.m.fdr == 2))
# false positives
fp.m.fdr <- cond.neg + test.pos.fdr
fp.fdr <- length(which(fp.m.fdr == 2))
# true negatives
tn.m.fdr <- cond.neg + test.neg.fdr
tn.fdr <- length(which(tn.m.fdr == 2))
# false negatives
fn.m.fdr <- cond.pos + test.neg.fdr
fn.fdr <- length(which(fn.m.fdr == 2))
# sensitivity
if(tp.fdr+fn.fdr == 0){
sens.fdr <- 0
} else {
sens.fdr <- tp.fdr/(tp.fdr+fn.fdr)
}
# specificity
if(fp.fdr+tn.fdr == 0){
spec.fdr <- 0
} else {
spec.fdr <- tn.fdr/(fp.fdr+tn.fdr)
}
# true discovery rate
if(tp.fdr+fp.fdr == 0){
tdr.fdr <- 0
} else {
tdr.fdr <- tp.fdr/(tp.fdr+fp.fdr)
}
# true non-discovery rate
if(tn.fdr+fn.fdr == 0){
tnr.fdr <- 0
} else {
tnr.fdr <- tn.fdr/(tn.fdr+fn.fdr)
}
# save the results into ith row of a matrix
tp.q[i,1] <- qvalue
tp.q[i,2] <- tp.fdr
tp.q[i,3] <- fp.fdr
tp.q[i,4] <- tn.fdr
tp.q[i,5] <- fn.fdr
sens.q[i,1] <- qvalue
sens.q[i,2] <- sens.fdr
sens.q[i,3] <- spec.fdr
sens.q[i,4] <- tdr.fdr
sens.q[i,5] <- tnr.fdr
i <- i+1
}
tp.data[[j]] <- tp.q
sens.data[[j]] <- sens.q
j <- j+1
}
return(list(tp.list = tp.data, sens.list = sens.data ))
}
## CALL FUNCTION
qvals <- seq(0.0, 1.0, by = 0.05)
# no weights
FDR1 <- fdr.fun(qvals, data1$score.list, data1$pos.m, data1$neg.m)
FDR2 <- fdr.fun(qvals, data2$score.list, data2$pos.m, data2$neg.m)
FDR3 <- fdr.fun(qvals, data3$score.list, data3$pos.m, data3$neg.m)
FDR4 <- fdr.fun(qvals, data4$score.list, data4$pos.m, data4$neg.m)
## correction for SEs, qvalue = 0.2
# list with qval = 0.2
# P <- 100
# temp1 <- FDR4$sens.list[[4]]
# temp.mean <- mean(temp1[,2])
# temp.mean
# var.sens <- var(temp1[,2])
# se.sens <- sqrt(1/P*(var.sens))
# round(se.sens, 3)
# temp.mean <- mean(temp1[,4])
# temp.mean
# var.spec <- var(temp1[,4])
# se.spec <- sqrt(1/P*(var.spec))
# round(se.spec, 3)
# temp.mean <- mean(temp1[,6])
# temp.mean
# var.tdr <- var(temp1[,6])
# se.tdr <- sqrt(1/P*(var.tdr))
# round(se.tdr, 3)
# temp.mean <- mean(temp1[,8])
# temp.mean
# var.tnr <- var(temp1[,8])
# se.tnr <- sqrt(1/P*(var.tnr))
# round(se.tnr, 3)
# # hist(data1$score.list[[1]][upper.tri(data1$score.list[[1]])])
#
means1fdr <- lapply(FDR1$sens.list, colMeans)
means2 <- lapply(FDR2$sens.list, colMeans)
means3fdr <- lapply(FDR3$sens.list, colMeans)
means4 <- lapply(FDR4$sens.list, colMeans)