Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ Imports:
RColorBrewer,
mixtools,
cluster,
MCMCpack
RoxygenNote: 7.1.2
MCMCpack,
Matrix
RoxygenNote: 7.3.3
Suggests:
knitr,
rmarkdown
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# Generated by roxygen2: do not edit by hand
importFrom(Matrix,colSums)
importFrom(Matrix,rowSums)
export(copykat)
export(CNA.MCMC)
export(annotateGenes.hg20)
Expand Down
4 changes: 2 additions & 2 deletions R/annotateGenes.hg20.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ annotateGenes.hg20 <- function(mat, ID.type="S"){
anno <- full.anno[which(as.vector(full.anno$ensembl_gene_id) %in% shar),]
anno <- anno[!duplicated(anno$hgnc_symbol),]
anno <- anno[order(match(anno$ensembl_gene_id, rownames(mat))),]
data <- cbind(anno, mat)
list(anno = anno, counts = mat)

}else if(substring(ID.type,1,1) %in% c("S", "s")) {

Expand All @@ -27,7 +27,7 @@ annotateGenes.hg20 <- function(mat, ID.type="S"){
anno <- full.anno[which(as.vector(full.anno$hgnc_symbol) %in% shar),]
anno <- anno[!duplicated(anno$hgnc_symbol),]
anno <- anno[order(match(anno$hgnc_symbol, rownames(mat))),]
data <- cbind(anno, mat)
list(anno = anno, counts = mat)
}
}

4 changes: 2 additions & 2 deletions R/annotateGenes.mm10.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ annotateGenes.mm10 <- function(mat, ID.type="S", full.anno=full.anno.mm10){
anno <- full.anno[which(as.vector(full.anno$ensembl_gene_id) %in% shar),]
anno <- anno[!duplicated(anno$mgi_symbol),]
anno <- anno[order(match(anno$ensembl_gene_id, rownames(mat))),]
data <- cbind(anno, mat)
list(anno = anno, counts = mat)

}else if(substring(ID.type,1,1) %in% c("S", "s")) {

Expand All @@ -32,7 +32,7 @@ annotateGenes.mm10 <- function(mat, ID.type="S", full.anno=full.anno.mm10){
anno <- full.anno[which(as.vector(full.anno$mgi_symbol) %in% shar),]
anno <- anno[!duplicated(anno$mgi_symbol),]
anno <- anno[order(match(anno$mgi_symbol, rownames(mat))),]
data <- cbind(anno, mat)
list(anno = anno, counts = mat)
}
}

138 changes: 73 additions & 65 deletions R/copykat.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

#'
#' test.pred <- test.ck$prediction
#' @importFrom Matrix colSums rowSums
#' @export
###

Expand All @@ -38,15 +39,15 @@ start_time <- Sys.time()
print("step1: read and filter data ...")
print(paste(nrow(rawmat), " genes, ", ncol(rawmat), " cells in raw data", sep=""))

genes.raw <- apply(rawmat, 2, function(x)(sum(x>0)))
genes.raw <- colSums(rawmat != 0)

if(sum(genes.raw> min.gene.per.cell)==0) stop("none cells have more than min.gene.per.cell")
if(sum(genes.raw<min.gene.per.cell)>1){
rawmat <- rawmat[, -which(genes.raw< min.gene.per.cell)]
print(paste("filtered out ", sum(genes.raw<=min.gene.per.cell), " cells with less than min.gene.per.cell; remaining ", ncol(rawmat), " cells", sep=""))
}
##
der<- apply(rawmat,1,function(x)(sum(x>0)))/ncol(rawmat)
der <- rowSums(rawmat != 0)/ncol(rawmat)

if(sum(der>LOW.DR)>=1){
rawmat <- rawmat[which(der > LOW.DR), ]; print(paste(nrow(rawmat)," genes past LOW.DR filtering", sep=""))
Expand All @@ -61,66 +62,69 @@ start_time <- Sys.time()

print("step 2: annotations gene coordinates ...")
if(genome=="hg20"){
anno.mat <- annotateGenes.hg20(mat = rawmat, ID.type = id.type) #SYMBOL or ENSEMBLE
anno.list <- annotateGenes.hg20(mat = rawmat, ID.type = id.type) #SYMBOL or ENSEMBLE
} else if(genome=="mm10"){
anno.mat <- annotateGenes.mm10(mat = rawmat, ID.type = id.type) #SYMBOL or ENSEMBLE
anno.list <- annotateGenes.mm10(mat = rawmat, ID.type = id.type) #SYMBOL or ENSEMBLE
dim(rawmat)
}
anno.mat <- anno.mat[order(as.numeric(anno.mat$abspos), decreasing = FALSE),]
ord <- order(as.numeric(anno.list$anno$abspos), decreasing = FALSE)
anno.list$anno <- anno.list$anno[ord, ]
anno.list$counts <- anno.list$counts[ord, ]

# print(paste(nrow(anno.mat)," genes annotated", sep=""))
# print(paste(nrow(anno.list$anno)," genes annotated", sep=""))

### module 3 removing genes that are involved in cell cycling

if(genome=="hg20"){
HLAs <- anno.mat$hgnc_symbol[grep("^HLA-", anno.mat$hgnc_symbol)]
toRev <- which(anno.mat$hgnc_symbol %in% c(as.vector(cyclegenes[[1]]), HLAs))
HLAs <- anno.list$anno$hgnc_symbol[grep("^HLA-", anno.list$anno$hgnc_symbol)]
toRev <- which(anno.list$anno$hgnc_symbol %in% c(as.vector(cyclegenes[[1]]), HLAs))
if(length(toRev)>0){
anno.mat <- anno.mat[-toRev, ]
anno.list$anno <- anno.list$anno[-toRev, ]
anno.list$counts <- anno.list$counts[-toRev, ]
}
}
# print(paste(nrow(anno.mat)," genes after rm cell cycle genes", sep=""))
# print(paste(nrow(anno.list$anno)," genes after rm cell cycle genes", sep=""))
### secondary filtering
ToRemov2 <- NULL
for(i in 8:ncol(anno.mat)){
cell <- cbind(anno.mat$chromosome_name, anno.mat[,i])
cell <- cell[cell[,2]!=0,]
if(length(as.numeric(cell))< 5){
rm <- colnames(anno.mat)[i]
ToRemov2 <- c(ToRemov2, rm)
} else if(length(rle(cell[,1])$length)<length(unique((anno.mat$chromosome_name)))|min(rle(cell[,1])$length)< ngene.chr){
rm <- colnames(anno.mat)[i]
ToRemov2 <- c(ToRemov2, rm)
chrom_vec <- anno.list$anno$chromosome_name
ToRemov2 <- character(0)
for(j in seq_len(ncol(anno.list$counts))) {
nz_idx <- which(anno.list$counts[, j] != 0)
if(length(nz_idx) < 5) {
ToRemov2 <- c(ToRemov2, colnames(anno.list$counts)[j])
} else {
chr_nz <- chrom_vec[nz_idx]
rl <- rle(chr_nz)
if(length(rl$lengths) < length(unique(chrom_vec)) || min(rl$lengths) < ngene.chr)
ToRemov2 <- c(ToRemov2, colnames(anno.list$counts)[j])
}
i<- i+1
}

if(length(ToRemov2)==(ncol(anno.mat)-7)) stop("all cells are filtered")
if(length(ToRemov2)==ncol(anno.list$counts)) stop("all cells are filtered")
if(length(ToRemov2)>0){
anno.mat <-anno.mat[, -which(colnames(anno.mat) %in% ToRemov2)]
anno.list$counts <- anno.list$counts[, !(colnames(anno.list$counts) %in% ToRemov2), drop=FALSE]
}

# print(paste("filtered out ", length(ToRemov2), " cells with less than ",ngene.chr, " genes per chr", sep=""))
rawmat3 <- data.matrix(anno.mat[, 8:ncol(anno.mat)])
norm.mat<- log(sqrt(rawmat3)+sqrt(rawmat3+1))
norm.mat<- apply(norm.mat,2,function(x)(x <- x-mean(x)))
colnames(norm.mat) <- colnames(rawmat3)

#print(paste("A total of ", ncol(norm.mat), " cells, ", nrow(norm.mat), " genes after preprocessing", sep=""))
## compute DR2 from sparse counts before densification
DR2 <- rowSums(anno.list$counts != 0) / ncol(anno.list$counts)

##smooth data
##smooth data — normalize per-column to avoid building a full dense rawmat3/norm.mat
print("step 3: smoothing data with dlm ...")
dlm.sm <- function(c){
dlm.sm <- function(j){
col_raw <- as.numeric(anno.list$counts[, j])
col_norm <- log(sqrt(col_raw) + sqrt(col_raw + 1))
col_norm <- col_norm - mean(col_norm)
model <- dlm::dlmModPoly(order=1, dV=0.16, dW=0.001)
x <- dlm::dlmSmooth(norm.mat[, c], model)$s
x<- x[2:length(x)]
x <- x-mean(x)
x <- dlm::dlmSmooth(col_norm, model)$s
x <- x[2:length(x)]
x - mean(x)
}

test.mc <-parallel::mclapply(1:ncol(norm.mat), dlm.sm, mc.cores = n.cores)
norm.mat.smooth <- matrix(unlist(test.mc), ncol = ncol(norm.mat), byrow = FALSE)
test.mc <- parallel::mclapply(seq_len(ncol(anno.list$counts)), dlm.sm, mc.cores = n.cores)
norm.mat.smooth <- matrix(unlist(test.mc), ncol = ncol(anno.list$counts), byrow = FALSE)

colnames(norm.mat.smooth) <- colnames(norm.mat)
colnames(norm.mat.smooth) <- colnames(anno.list$counts)

print("step 4: measuring baselines ...")
if (cell.line=="yes"){
Expand Down Expand Up @@ -182,25 +186,29 @@ start_time <- Sys.time()
}

###use a smaller set of genes to perform segmentation
DR2 <- apply(rawmat3,1,function(x)(sum(x>0)))/ncol(rawmat3)
## DR2 was already computed from sparse counts before dlm smoothing
##relative expression using pred.normal cells
norm.mat.relat <- norm.mat.relat[which(DR2>=UP.DR),]

###filter cells
anno.mat2 <- anno.mat[which(DR2>=UP.DR), ]

ToRemov3 <- NULL
for(i in 8:ncol(anno.mat2)){
cell <- cbind(anno.mat2$chromosome_name, anno.mat2[,i])
cell <- cell[cell[,2]!=0,]
if(length(as.numeric(cell))< 5){
rm <- colnames(anno.mat2)[i]
ToRemov3 <- c(ToRemov3, rm)
} else if(length(rle(cell[,1])$length)<length(unique((anno.mat$chromosome_name)))|min(rle(cell[,1])$length)< ngene.chr){
rm <- colnames(anno.mat2)[i]
ToRemov3 <- c(ToRemov3, rm)
keep_rows2 <- which(DR2 >= UP.DR)
anno.mat2 <- list(
anno = anno.list$anno[keep_rows2, ],
counts = anno.list$counts[keep_rows2, , drop=FALSE]
)

chrom_vec2 <- anno.mat2$anno$chromosome_name
ToRemov3 <- character(0)
for(j in seq_len(ncol(anno.mat2$counts))) {
nz_idx <- which(anno.mat2$counts[, j] != 0)
if(length(nz_idx) < 5) {
ToRemov3 <- c(ToRemov3, colnames(anno.mat2$counts)[j])
} else {
chr_nz <- chrom_vec2[nz_idx]
rl <- rle(chr_nz)
if(length(rl$lengths) < length(unique(chrom_vec)) || min(rl$lengths) < ngene.chr)
ToRemov3 <- c(ToRemov3, colnames(anno.mat2$counts)[j])
}
i<- i+1
}

if(length(ToRemov3)==ncol(norm.mat.relat)) stop ("all cells are filtered")
Expand Down Expand Up @@ -232,7 +240,7 @@ start_time <- Sys.time()

colnames(results$logCNA) <- colnames(norm.mat.relat)
results.com <- apply(results$logCNA,2, function(x)(x <- x-mean(x)))
RNA.copycat <- cbind(anno.mat2[, 1:7], results.com)
RNA.copycat <- cbind(anno.mat2$anno, results.com)

write.table(RNA.copycat, paste(sample.name, "CNA_raw_results_gene_by_cell.txt", sep=""), sep="\t", row.names = FALSE, quote = F)

Expand Down Expand Up @@ -289,8 +297,8 @@ start_time <- Sys.time()
### add a step to plot out gene by cell matrix
if(plot.genes=="TRUE"){

rownames(results.com) <- anno.mat2$hgnc_symbol
chrg <- as.numeric(anno.mat2$chrom) %% 2+1
rownames(results.com) <- anno.mat2$anno$hgnc_symbol
chrg <- as.numeric(anno.mat2$anno$chrom) %% 2+1
rbPal1g <- colorRampPalette(c('black','grey'))
CHRg <- rbPal1(2)[as.numeric(chrg)]
chr1g <- cbind(CHRg,CHRg)
Expand Down Expand Up @@ -318,8 +326,8 @@ start_time <- Sys.time()
### add a step to plot out gene by cell matrix
if(plot.genes=="TRUE"){

rownames(results.com) <- anno.mat2$hgnc_symbol
chrg <- as.numeric(anno.mat2$chrom) %% 2+1
rownames(results.com) <- anno.mat2$anno$hgnc_symbol
chrg <- as.numeric(anno.mat2$anno$chrom) %% 2+1
rbPal1g <- colorRampPalette(c('black','grey'))
CHRg <- rbPal1(2)[as.numeric(chrg)]
chr1g <- cbind(CHRg,CHRg)
Expand Down Expand Up @@ -477,8 +485,8 @@ start_time <- Sys.time()
### add a step to plot out gene by cell matrix
if(plot.genes=="TRUE"){
dim(results.com)
rownames(results.com) <- anno.mat2$hgnc_symbol
chrg <- as.numeric(anno.mat2$chrom) %% 2+1
rownames(results.com) <- anno.mat2$anno$hgnc_symbol
chrg <- as.numeric(anno.mat2$anno$chrom) %% 2+1
rbPal1g <- colorRampPalette(c('black','grey'))
CHRg <- rbPal1(2)[as.numeric(chrg)]
chr1g <- cbind(CHRg,CHRg)
Expand Down Expand Up @@ -511,8 +519,8 @@ start_time <- Sys.time()
### add a step to plot out gene by cell matrix
if(plot.genes=="TRUE"){
dim(results.com)
rownames(results.com) <- anno.mat2$hgnc_symbol
chrg <- as.numeric(anno.mat2$chrom) %% 2+1
rownames(results.com) <- anno.mat2$anno$hgnc_symbol
chrg <- as.numeric(anno.mat2$anno$chrom) %% 2+1
rbPal1g <- colorRampPalette(c('black','grey'))
CHRg <- rbPal1(2)[as.numeric(chrg)]
chr1g <- cbind(CHRg,CHRg)
Expand Down Expand Up @@ -678,15 +686,15 @@ start_time <- Sys.time()
write.table(res, paste(sample.name, "prediction.txt",sep=""), sep="\t", row.names = FALSE, quote = FALSE)

####save copycat CNA
write.table(cbind(anno.mat2[, 1:7], mat.adj), paste(sample.name, "CNA_results.txt", sep=""), sep="\t", row.names = FALSE, quote = F)
write.table(cbind(anno.mat2$anno, mat.adj), paste(sample.name, "CNA_results.txt", sep=""), sep="\t", row.names = FALSE, quote = F)

####%%%%%%%%%%%%%%%%%next heatmaps, subpopulations and tSNE overlay
print("step 10: ploting heatmap ...")
my_palette <- colorRampPalette(rev(RColorBrewer::brewer.pal(n = 3, name = "RdBu")))(n = 999)

rownames(mat.adj) <- anno.mat2$mgi_symbol
chrg <- as.numeric(anno.mat2$chromosome_name) %% 2+1
rle(as.numeric(anno.mat2$chromosome_name))
rownames(mat.adj) <- anno.mat2$anno$mgi_symbol
chrg <- as.numeric(anno.mat2$anno$chromosome_name) %% 2+1
rle(as.numeric(anno.mat2$anno$chromosome_name))
rbPal1g <- colorRampPalette(c('black','grey'))
CHRg <- rbPal1g(2)[as.numeric(chrg)]
chr1g <- cbind(CHRg,CHRg)
Expand Down Expand Up @@ -734,7 +742,7 @@ start_time <- Sys.time()
if(output.seg=="TRUE"){
print("generating seg files for IGV viewer")

thisRatio <- cbind(anno.mat2[, c(2,3,1)], mat.adj)
thisRatio <- cbind(anno.mat2$anno[, c(2,3,1)], mat.adj)
Short <- NULL
chr <- rle(thisRatio$chromosome_name)[[2]]

Expand Down Expand Up @@ -778,7 +786,7 @@ start_time <- Sys.time()
end_time<- Sys.time()
print(end_time -start_time)

reslts <- list(res, cbind(anno.mat2[, 1:7], mat.adj), hcc)
reslts <- list(res, cbind(anno.mat2$anno, mat.adj), hcc)
names(reslts) <- c("prediction", "CNAmat","hclustering")
return(reslts)

Expand Down