## cPLS -FUNCTION
# data = takes in a matrix of raw read count data (integers),
# where rows represent genes and columns represent samples
# ncom = number of PLS-components computed, 3 by default
cPLS <- function(data, ncom=3){
library(MASS)
library(mgcv)
# transpose data
tdata <- t(data)
# number of genes
g <- ncol(tdata)
# total read counts
tot.c <- as.vector(apply(tdata, 1, sum))
# scale data
data.sc <- scale(tdata)
# matrix for association scores
s <- matrix(0, g, g)
# function for stand. pls-components
standr <- function(x) x/as.numeric(sqrt(t(x)%*%x))
for(i in 1:g){
cat("gene = ", i, "\n")
# PLS-components
X <- data.sc[,-i]
y <- data.sc[,i]
c <- matrix(0, g-1, ncom)
TT <- matrix(0, nrow(data.sc), ncom)
tX<- X
for(k in 1:ncom){ # construct ncom PLS components
c[,k] <- standr(t(tX)%*%y)
TT[,k] <- tX%*%c[,k]
tX <- tX-TT[,k]%*%solve(t(TT[,k])%*%TT[,k])%*%t(TT[,k])%*%tX
}
# glm with negative-binomial link
temp.data <- as.data.frame(cbind(tdata[,i], TT, tot.c))
comp.names <- paste("C", 1:ncom, sep = '')
colnames(temp.data) <- c('y', comp.names, 't.counts')
names <- colnames(temp.data[2:(ncom+1)])
osa1 <- paste("y~")
osa2 <- paste(names, collapse = "+")
form <- paste(osa1, osa2, sep = "")
form2 <- as.formula(form)
mod <- gam(form2, offset=log(t.counts), family=nb(),data=temp.data)
summary(mod)
b <- as.matrix(coef(mod)[-1])
beta <- c%*%b
if (i == 1){
s[i,] <- c(0,beta)
} else {
if (i == g){
s[i,] <- c(beta,0)
} else {
s[i,] <- c(beta[1:(i-1)],0,beta[i:(g-1)])
}
}
}
# symmetrize and scale the scores (max score = 1, min = -1)
s <- 0.5*(s+t(s))
s <- s-0.5*(max(s)+min(s))
s <- s/(0.5*(max(s)-min(s)))
diag(s) <- 1
return(list(score.m = s))
}