R code for Multinormal Simulation

 

# PARAMETERS

# g = number of genes

# m = number of important genes

# n = number of samples

# rho = correlation

# ncom = number of PLS-components

# P = number of data sets to be generated

 

# g = 100

# m = 10

# n = 20

# rho = 0.9

# ncom = 3

# P = 2

 

sim.multin.data <- function(g, m, n, rho, ncom, P){

 

  library(MASS)

  library(mgcv)

  library(SimSeq)

 

  # lists for results

  data.list <- list()

  score.list <- list()

 

  ## PARAMETRIT MULTINORMAALIJAKAUMALLE

 

  # covariance matrix

  # only positive covariance elements

  m1 <- diag(m)

  for(i in 1:m){

    for(j in 1:(i-1)){

      if(i == j){

        m1[i,j] <- 1

      } else{

       

        m1[i,j] <- rho

       

      }

    }

  }

 

  m2 <- t(m1) + m1

  m3 <- diag(2, g)

  m3[1:m, 1:m] <- m2

  cov.m <- m3 - diag(g)

 

  ## EXPECTED VALUES

  mu.v <- rep(0, g) 

 

  # CONDITION POSITIVE

  cond.pos <- matrix(0, ncol = g, nrow = g)

  cov.up <- ifelse(upper.tri(cov.m), cov.m, 0)

  cond.pos[which(abs(cov.up) == rho, arr.ind = TRUE)] <- 1

  pos.m <- cond.pos

 

  # CONDITION NEGATIVE

  cond.neg <- matrix(1, ncol = g, nrow = g)

  cond.neg <- ifelse(upper.tri(cond.neg), 1, 0)

  cond.neg <- cond.neg - cond.pos

  neg.m <- cond.neg

 

  # NUMBER OF CONNECTIONS

  con <- m*(m-1)*0.5

 

  ## DATA for sampling total counts

  data(kidney)

  kidney.data <- t(kidney$counts)

 

  ## GENERATE TOTAL READ COUNT C FOR SUBJECT j FROM A DISTRIBUTION THAT IS REALISTIC

  t.counts <- c()

  for(j in 1:nrow(kidney.data)){

    t.counts[j] <- sum(sample(kidney.data[j,], g, replace = TRUE))

  }

 

  for(p in 1:P){

   

    cat("sim = ", iter <- p, "\n")

   

    s <- matrix(0, g, g)

    data.sim <- matrix(NA, nrow = n, ncol = g)

 

    ## GENERATE DATA

   

    ## MULTIVARIATE NORMAL DATA

    mvn.m <-  mvrnorm(n, mu.v, cov.m)/5

   

    ## EXP(X_i)

    exp.mvn.m <- exp(mvn.m)

   

    ## GENERATE A VECTOR OF READ COUNTS FOR EACH SAMPLE FROM MULTINOMIAL DISTRIBUTION WITH PARAMETERS

    ## C (NUMBER OF TRIALS) AND SUCCESS PROPABILITIES lambda_j / lambda_tot

   

    # lambda_tot = sum of lambda_i = sum of exp(x_i)

    lambda.tot <- apply(exp.mvn.m, 1, sum)

   

    # loop over subjects i, i = 1, ..., n

    count.m <- matrix(0, nrow = n, ncol = g)

   

    for(j in 1:n){

      # resample C from t.counts

      C <- sample(t.counts, 1, replace = TRUE) # t.counts are computed from random p genes

      # succes probabilities for each gene for subject i and counts from multin. dist.

      prob <- c()

      # one sample (here are the \lambda_1 - \lambda_G)

      sample <- exp.mvn.m[j,]

      # probabilities: \lambda_1 / \lambda_tot from sample j

      prob <- sample / lambda.tot[j]

     

      count.m[j,] <- t(rmultinom(1, C, prob))

    }

   

    data.sim <- count.m

 

    data.list[[p]] <- data.sim

   

    # scale data

    data.sc <- scale(data.sim)

   

    # PLS-components

    standr <- function(x) x/as.numeric(sqrt(t(x)%*%x))

   

    tot.c <- as.vector(apply(data.sim, 1, sum))

   

    for(i in 1:g){

     

      X <- data.sc[,-i]

      y <- data.sc[,i]

     

      c <- matrix(0, g-1, ncom)

      TT <- matrix(0, nrow(data.sc), ncom)

     

      c.var <- rep(0, ncom)

      SS <- rep(0, ncom)

      var.expl <- rep(0, ncom)

     

      tX<- X

      for(k in 1:ncom){ # construct ncom PLS components

        c[,k] <- standr(t(tX)%*%y)

        TT[,k] <- tX%*%c[,k]

       

        c.var[k] <- (t(y)%*%TT[,k])/(t(TT[,k])%*%TT[,k])

        SS[k] <- (t(TT[,k])%*%TT[,k])%*%(t(c.var[k])%*%c.var[k])

        var.expl[k] <- SS[k] / sum(diag(t(y)%*%y))

       

        tX <- tX-TT[,k]%*%solve(t(TT[,k])%*%TT[,k])%*%t(TT[,k])%*%tX

      }

     

      w <- var.expl / sum(var.expl) # weights

     

      # glm with negative-binomial link

     

      temp.data <- as.data.frame(cbind(data.sim[,i], TT, tot.c))

      comp.names <- paste("C", 1:ncom, sep = '')

      colnames(temp.data) <- c('y', comp.names, 't.counts')

     

      names <- colnames(temp.data[2:(ncom+1)])

      osa1 <- paste("y~")

      osa2 <- paste(names, collapse = "+")

      form <- paste(osa1, osa2, sep = "")

      form2 <- as.formula(form)

     

      mod <- gam(form2, offset=log(t.counts), family=nb(),data=temp.data)

      summary(mod)

     

      b <- as.matrix(coef(mod)[-1])

     

      temp.beta <- c%*%b

     

      if (i == 1){

        s[i,] <- c(0,temp.beta)

      } else {

        if (i == g){

          s[i,] <- c(temp.beta,0)

        } else {

          s[i,] <- c(temp.beta[1:(i-1)],0,temp.beta[i:(g-1)])

        } 

      }

     

    }

   

    # lopullinen symmetrisoitu ja skaalattu (max score = 1, min = -1) assosiaatiomatriisi

    s <- 0.5*(s+t(s))

    s <- s-0.5*(max(s)+min(s))

    s <- s/(0.5*(max(s)-min(s)))

    diag(s) <- 1

   

    score.list[[p]] <- s

   

  }

  return(list(data.list = data.list, score.list = score.list,

              pos.m = pos.m, neg.m = neg.m, con.m = con ))

 

}

 

# CALL FUNCTION

 

data1 <- sim.multin.data(100, 3, 20, 0.9, 3, 1000)

data2 <- sim.multin.data(1000, 10, 20, 0.9, 3, 100)

data3 <- sim.multin.data(100, 3, 100, 0.9, 3, 1000)

data4 <- sim.multin.data(1000, 10, 100, 0.9, 3, 100)

 

###################################################

 

 

# FUNCTION COMPUTING FDR WITH DIFFERENT LEVELS OF QVALUE

 

 

fdr.fun <- function(qvals, score, pos, neg){

 

  # save the results

  tp.data <- list()

  sens.data <- list()

 

  # condition positive

  cond.pos <- pos

  # condition negative

  cond.neg <- neg

 

  # loop over all selected qvalues

 

  j <- 1

 

  for(qvalue in qvals){

    cat("qval = ", iter <- qvalue , "\n")

   

    i <- 1

   

    tp.q <- matrix(0, ncol = 5, nrow = length(score))

    sens.q <- matrix(0, ncol = 5, nrow = length(score))

   

    # loop over all generated data sets

    for(d in 1:length(score)){

     

      s <- score[[d]]

      s.up <- s[upper.tri(s)]

      m <- ncol(s)

     

      b <- matrix(0, m, m)

   

      out.density <- density(s.up, adjust = 2, from = -1, to = 1)

      b[upper.tri(b, diag=FALSE)] <- (dnorm(s.up,mean(s.up),sd(s.up))/approx(out.density$x, out.density$y,

                                                                            s.up, method = "linear")$y)

     

      apu <- matrix(0, m, m)

      apu[upper.tri(b, diag=FALSE)] <- 1

      apu[which(b > qvalue, arr.ind = TRUE)] <- 0

     

      fdr.m <- which(apu == 1, arr.ind=T)

     

      # test positive

      test.pos.fdr <- matrix(0, ncol = m, nrow = m)

      test.pos.fdr[fdr.m] <- 1

     

      # test negative

      test.neg.fdr <- matrix(1, ncol = m, nrow = m)

      test.neg.fdr <- ifelse(upper.tri(test.neg.fdr), 1, 0)

      test.neg.fdr[fdr.m] <- 0

     

      # true positives

      tp.m.fdr <- cond.pos+test.pos.fdr

      tp.fdr <- length(which(tp.m.fdr == 2))

     

      # false positives

      fp.m.fdr <- cond.neg + test.pos.fdr

      fp.fdr <- length(which(fp.m.fdr == 2))

     

      # true negatives

      tn.m.fdr <- cond.neg + test.neg.fdr

      tn.fdr <- length(which(tn.m.fdr == 2))

     

      # false negatives

      fn.m.fdr <- cond.pos + test.neg.fdr

      fn.fdr <- length(which(fn.m.fdr == 2))

     

      # sensitivity

      if(tp.fdr+fn.fdr == 0){

        sens.fdr <- 0

      } else {

        sens.fdr <- tp.fdr/(tp.fdr+fn.fdr)

      }

     

      # specificity

      if(fp.fdr+tn.fdr == 0){

        spec.fdr <- 0

      } else {

        spec.fdr <- tn.fdr/(fp.fdr+tn.fdr)

      }

     

      # true discovery rate

      if(tp.fdr+fp.fdr == 0){

        tdr.fdr <- 0

      } else {

        tdr.fdr <- tp.fdr/(tp.fdr+fp.fdr)

      }

     

      # true non-discovery rate

      if(tn.fdr+fn.fdr == 0){

        tnr.fdr <- 0

      } else {

        tnr.fdr <- tn.fdr/(tn.fdr+fn.fdr)

      }

   

      # save the results into ith row of a matrix

      tp.q[i,1] <- qvalue

      tp.q[i,2] <- tp.fdr

      tp.q[i,3] <- fp.fdr

      tp.q[i,4] <- tn.fdr

      tp.q[i,5] <- fn.fdr

     

      sens.q[i,1] <- qvalue

      sens.q[i,2] <- sens.fdr

      sens.q[i,3] <- spec.fdr

      sens.q[i,4] <- tdr.fdr

      sens.q[i,5] <- tnr.fdr

     

      i <- i+1

    }

   

    tp.data[[j]] <- tp.q

    sens.data[[j]] <- sens.q

   

    j <- j+1

   

  }

  return(list(tp.list = tp.data, sens.list = sens.data ))

}

 

 

## CALL FUNCTION

 

qvals <- seq(0.0, 1.0, by = 0.05)

# no weights

FDR1 <- fdr.fun(qvals, data1$score.list, data1$pos.m, data1$neg.m)

FDR2 <- fdr.fun(qvals, data2$score.list, data2$pos.m, data2$neg.m)

FDR3 <- fdr.fun(qvals, data3$score.list, data3$pos.m, data3$neg.m)

FDR4 <- fdr.fun(qvals, data4$score.list, data4$pos.m, data4$neg.m)

 

 

## correction for SEs, qvalue = 0.2

# list with qval = 0.2

# P <- 100

# temp1 <- FDR4$sens.list[[4]]

# temp.mean <- mean(temp1[,2])

# temp.mean

# var.sens <- var(temp1[,2])

# se.sens <-  sqrt(1/P*(var.sens))

# round(se.sens, 3)

# temp.mean <- mean(temp1[,4])

# temp.mean

# var.spec <- var(temp1[,4])

# se.spec <- sqrt(1/P*(var.spec))

# round(se.spec, 3)

# temp.mean <- mean(temp1[,6])

# temp.mean

# var.tdr <- var(temp1[,6])

# se.tdr <- sqrt(1/P*(var.tdr))

# round(se.tdr, 3)

# temp.mean <- mean(temp1[,8])

# temp.mean

# var.tnr <- var(temp1[,8])

# se.tnr <- sqrt(1/P*(var.tnr))

# round(se.tnr, 3)

# # hist(data1$score.list[[1]][upper.tri(data1$score.list[[1]])])

#

means1fdr <- lapply(FDR1$sens.list, colMeans)

means2 <- lapply(FDR2$sens.list, colMeans)

means3fdr <- lapply(FDR3$sens.list, colMeans)

means4 <- lapply(FDR4$sens.list, colMeans)

 

 

 

Made with Namu6