-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathformat_data.R
97 lines (77 loc) · 4.28 KB
/
format_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
DATA.TYPES <- c('counts', 'logcounts', 'activity')
if (!exists('count_data')) {source('figure_common_functions.R')}
if (!exists('sce.data')) {sce.data <<- dimred_for_umap(load_data_into_sce(count.data = dplyr::select(count_data, -META.DATA.COLUMNS),
meta.data = meta_data),
random_state=RANDOM.SEED)}
OUT.DIR <- "shiny_data"
dir.create(OUT.DIR, showWarnings = FALSE)
# Read in the learned network
network.learned <- read.table(file.path(WORKING.PATH, NETWORK.PATH, NETWORK.FILE), sep = "\t", header = TRUE)
rownames(network.learned) <- network.learned$target
network.learned$target <- NULL
# Calculate transcription factor activity from a network and an expression matrix
tfa <- function(prior, exp.mat, noself=TRUE, dup.self=TRUE) {
tfwt <- apply(prior != 0, 2, sum) > 0
duplicates <- c()
if (dup.self) {
duplicates <- duplicated(prior[, tfwt], MARGIN=2) |
duplicated(prior[, tfwt], MARGIN=2, fromLast = TRUE)
duplicates <- colnames(prior)[tfwt][duplicates]
}
tfs <- setdiff(colnames(prior), duplicates)
tfs <- intersect(tfs, rownames(prior))
if (noself) {
diag(prior[tfs, tfs]) <- 0
}
activities <- matrix(0, ncol(prior), ncol(exp.mat),
dimnames=list(colnames(prior), colnames(exp.mat)))
if (any(tfwt)) {
activities[tfwt, ] <- corpcor::pseudoinverse(prior[, tfwt, drop=FALSE]) %*% exp.mat
}
use.exp <- intersect(colnames(prior)[!tfwt], rownames(exp.mat))
activities[use.exp, ] <- exp.mat[use.exp, ]
return(activities)
}
count.for.activity <- as.matrix(t(dplyr::select(count_data, -META.DATA.COLUMNS)[, rownames(network.learned)]))
activities <- tfa(as.matrix(network.learned), count.for.activity)
SingleCellExperiment::reducedDim(sce.data, "activity") <- t(activities)
sce.obj.fig3 <- load_data_into_sce(count.data = dplyr::select(count_data, -META.DATA.COLUMNS), meta.data = meta_data)
sce.conditions <- list()
for (condition in rev(CONDITION.LEVELS)) {sce.conditions[[condition]] <- sce.obj.fig3[, sce.obj.fig3$Condition == condition]}
rm(sce.obj.fig3)
print("Normalizing Figure 3 Data")
sce.conditions <- lapply(sce.conditions, normalizeSCE)
print("PCA for Figure 3 Data")
sce.conditions <- lapply(sce.conditions, function(x) {runPCA(x, ncomponents=50, ntop=Inf, method='irlba', exprs_values='logcounts')})
print("UMAP for Figure 3 Data")
sce.conditions <- lapply(sce.conditions, function(x) {runUMAP(x, random_state=RANDOM.SEED, min_dist=0.1, use_dimred='PCA')})
print("SNN for Figure 3 Data")
sce.conditions <- lapply(sce.conditions, apply.snn.cluster)
fig3.data <- NULL
for (condition in unique(count_data$Condition)) {
df.umap <- as.data.frame(reducedDim(sce.conditions[[condition]], 'UMAP'))
colnames(df.umap) <- c("UMAP.FIG3.1", "UMAP.FIG3.2")
df.umap$Cluster <- sce.conditions[[condition]]$cluster
df.umap$Condition <- condition
fig3.data <- rbind(fig3.data, df.umap)
}
# Write the meta data with UMAP coords
df <- as.data.frame(reducedDim(sce.data, 'UMAP'))
colnames(df) <- c('UMAP1', 'UMAP2')
df <- cbind(meta_data[, c("Condition", "Genotype_Group", "Genotype")], df)
df$UMI <- colSums(SummarizedExperiment::assay(sce.data, 'counts'))
print(all(df$Condition == fig3.data$Condition))
fig3.data$Condition <- NULL
df <- cbind(df, fig3.data)
write.table(df, gzfile(file.path(OUT.DIR, "jackson_2019_meta_data.tsv.gz"), open = "w"), quote = FALSE, sep = "\t", row.names = FALSE)
gene.names <- colnames(count_data)[!colnames(count_data) %in% META.DATA.COLUMNS]
for (gene.name in gene.names) {
gene.data <- as.data.frame(t(SummarizedExperiment::assay(sce.data[gene.name, ], 'counts')))
gene.data <- cbind(gene.data, as.data.frame(t(SummarizedExperiment::assay(sce.data[gene.name, ], 'logcounts'))))
if (!gene.name %in% colnames(SingleCellExperiment::reducedDim(sce.data, 'activity'))) {gene.data$activity <- NA}
else {gene.data <- cbind(gene.data, as.data.frame(SingleCellExperiment::reducedDim(sce.data, 'activity')[, gene.name]))}
colnames(gene.data) <- c("counts", "logcounts", "activity")
file.connection <- gzfile(file.path(OUT.DIR, paste0(gsub("-", "\\.", gene.name), ".tsv.gz")), open = "w")
write.table(gene.data, file.connection, quote = FALSE, sep = "\t", row.names = FALSE)
close(file.connection)
}