-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmethyl_fastq_pipeline.py
106 lines (80 loc) · 3.06 KB
/
methyl_fastq_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import argparse
from multiprocessing import Pool
import methyl_utils
import os
import subprocess
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--cores", type=str)
parser.add_argument("-i", "--indir", type=str)
parser.add_argument("-s", "--sample", type=str)
parser.add_argument("-b", "--barcodes", type=str)
parser.add_argument("-d", "--total_droplets", type=int)
parser.add_argument("-l", "--limit", default=False, action="store_true")
args = parser.parse_args()
cores = args.cores
indir = args.indir
sample = args.sample
barcodes = args.barcodes
total_droplets = args.total_droplets
limit = args.limit
py_dir = os.path.dirname(os.path.abspath(__file__))
######################################################
methyl_utils.split_fastq_by_lines(indir, sample, 40e6)
######################################################
parts = methyl_utils.find_sub_fastq_parts(indir, sample)
args = [(indir, sample, part, limit) for part in parts]
pool = Pool(int(cores))
results = pool.starmap(methyl_utils.extract_clean_fastq, args)
pool.close()
pool.join()
######################################################
methyl_utils.aggregate_bc_dicts(indir, sample)
methyl_utils.write_bc_raw_reads(indir, sample, 3)
######################################################
if barcodes is None:
whitelist = f"{py_dir}/data/737K-arc-v1.txt.gz"
methyl_utils.write_bc_whitelist(indir, sample, whitelist)
######################################################
ref_generation_log = f"{indir}/{sample}/{sample}_ref/Log.out"
if os.path.isfile(ref_generation_log):
print("ref generation log", ref_generation_log, " exists")
with open(ref_generation_log, "r") as file:
lines = file.readlines()
last_line = lines[-1].strip()
print(last_line)
if last_line != "DONE: Genome generation, EXITING":
print("Genome generation log is not complete")
else:
print(
"ref generation log", ref_generation_log, " does not exist, will make reference"
)
subprocess.run(
[
f"{py_dir}/scripts/barcode_ref.sh",
f"{indir}/{sample}/{sample}_bc_whitelist.fasta",
f"{indir}/{sample}/{sample}_ref/",
]
)
######################################################
alignment_log = f"{indir}/{sample}/{sample}_matchingLog.final.out"
if os.path.isfile(alignment_log):
print("alignment log", alignment_log, " exists")
else:
print("alignment log", alignment_log, " does not exist, will perform alignment")
subprocess.run(
[
f"{py_dir}/scripts/barcode_align.sh",
f"{indir}/{sample}/{sample}_bc_raw_reads.fasta",
f"{indir}/{sample}/{sample}_ref/",
f"{indir}/{sample}/{sample}_matching",
cores,
"-1",
]
)
######################################################
methyl_utils.processing_matching(indir, sample, AS_min=14)
if total_droplets is None:
total_droplets = 50000
methyl_utils.filered_barcodes(indir, sample, total_droplets)
methyl_utils.split_bcs_to_batches(indir, sample)
######################################################