maps_processing.py
from confidence_calculation import *  # expected to provide download_matrix, calculate_confidence, extract_tad, replicates_correlation and downsample
import numpy as np
import cooler
import cooltools
import cooltools.expected
import cooltools.saddle
import warnings

warnings.filterwarnings("ignore")


def average_tad(file_names, k_parameter, tads_positions, area_size):
    '''Calculate the average Hi-C signal around TAD boundaries.

    Parameters
    ----------
    file_names : list of str
        Paths to two Hi-C replicates
    k_parameter : int
        Special parameter for confidence calculation
    tads_positions : pandas.DataFrame
        Positions of all TADs with columns <Chr, Start, End>
    area_size : int
        Size of the window around TAD boundaries to extract

    Returns
    -------
    avg_tad : dict of two numpy.ndarray
        Dict with 2 arrays of size (2 * area_size + 1) x (2 * area_size + 1) holding the values
        of the two original Hi-C matrices averaged around TAD boundaries.'''
    # Download Hi-C replicates
    mtx = {}
    mtx['rep_1'], regions, bin_size = download_matrix(file_names[0])
    mtx['rep_2'], regions, bin_size = download_matrix(file_names[1])
    # Calculate the confidence parameter for every chromosome
    confidence_chrm = {}
    for chrm in regions:
        confidence_chrm[chrm] = calculate_confidence(mtx['rep_1'](chrm, chrm), mtx['rep_2'](chrm, chrm), k_parameter)
    # Normalization to confidence and average TAD calculation
    avg_tad = {}
    for rep in ['rep_1', 'rep_2']:
        tads_areas = []        # list of areas of Hi-C values around TAD boundaries
        confidence_areas = []  # list of areas of confidence values around TAD boundaries
        for chrm in regions:
            # Extract areas around TAD boundaries
            # Left boundaries
            tads_areas += [extract_tad(mtx[rep](chrm, chrm), tads_positions.loc[tad, :], bin_size, area_size, boundary='Left') for tad in tads_positions[tads_positions['Chr'] == chrm[0]].index]
            confidence_areas += [extract_tad(confidence_chrm[chrm], tads_positions.loc[tad, :], bin_size, area_size, boundary='Left') for tad in tads_positions[tads_positions['Chr'] == chrm[0]].index]
            # Right boundaries
            tads_areas += [extract_tad(mtx[rep](chrm, chrm), tads_positions.loc[tad, :], bin_size, area_size, boundary='Right') for tad in tads_positions[tads_positions['Chr'] == chrm[0]].index]
            confidence_areas += [extract_tad(confidence_chrm[chrm], tads_positions.loc[tad, :], bin_size, area_size, boundary='Right') for tad in tads_positions[tads_positions['Chr'] == chrm[0]].index]
        # Build numpy arrays, keeping only the areas with the proper size
        tads_areas = np.asarray([i for i in tads_areas if i.shape[0] == area_size * 2 + 1])
        confidence_areas = np.asarray([i for i in confidence_areas if i.shape[0] == area_size * 2 + 1])
        confidence_areas /= np.nansum(confidence_areas, axis=0)  # Normalize confidence for every bin
        # Average over all TADs, weighting by the normalized confidence
        avg_tad[rep] = np.nansum(confidence_areas * tads_areas, axis=0)
    return avg_tad


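# Illustrative use of average_tad (not part of the original module). The file paths,
# TAD table and parameter values below are assumptions for demonstration only:
#
#     import pandas as pd
#     import matplotlib.pyplot as plt
#
#     tads = pd.read_csv("tads.tsv", sep="\t")  # assumed columns: Chr, Start, End
#     avg = average_tad(["rep1.cool", "rep2.cool"], k_parameter=2,
#                       tads_positions=tads, area_size=10)
#     plt.imshow(avg['rep_1'], cmap='coolwarm')  # boundary pile-up for replicate 1
#     plt.colorbar()
#     plt.show()
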
def choose_k(file_names, max_k=5, window_size=5):
    '''Calculate the parameter k for the confidence formula. The function takes two replicates,
    iteratively normalizes them with different values of k and picks the k that maximizes the
    correlation between the normalized replicates.

    Parameters
    ----------
    file_names : list of str
        Paths to two Hi-C replicates
    max_k : int
        Maximum value for k
    window_size : int
        Size of the window for insulation score calculation

    Returns
    -------
    k : int
        k parameter value'''
    correlations = replicates_correlation(window_size, max_k, file_names)  # Replicates' correlation for every k tried
    k = np.array(correlations).argmax() + 1  # The k that gives the highest replicates' correlation
    return k


def downsampling(file_names, k_parameter, factor, output_file_name):
    '''Downsample the Hi-C map, taking the confidence parameter into account.

    Parameters
    ----------
    file_names : list of str
        Paths to two Hi-C replicates
    k_parameter : int
        Special parameter for confidence calculation
    factor :
        Coefficient for downsampling; the shape of the resulting map equals the original shape / factor
    output_file_name : str
        Name of the file to write results to

    Returns
    -------
    The function writes <output_file_name>_rep1 and <output_file_name>_rep2 files with normalized
    contacts of the downsampled Hi-C maps. Each resulting file is a tsv pairs file with columns:
    chr1 start1 end1 chr2 start2 end2 value'''
    # Download Hi-C maps for the replicates
    cooler_rep1 = cooler.Cooler(file_names[0])
    cooler_rep2 = cooler.Cooler(file_names[1])
    balanced_mtx1 = cooler_rep1.matrix(balance=True)
    balanced_mtx2 = cooler_rep2.matrix(balance=True)
    bin_size = cooler_rep1.binsize  # Size of bins
    regions = [(chrom, 0, cooler_rep1.chromsizes[chrom]) for chrom in cooler_rep1.chromnames]  # Names and sizes of chromosomes
    # Observed over expected
    expected_rep1 = cooltools.expected.cis_expected(cooler_rep1, regions, use_dask=True)
    expected_rep2 = cooltools.expected.cis_expected(cooler_rep2, regions, use_dask=True)
    getmatrix_rep1 = cooltools.saddle.make_cis_obsexp_fetcher(cooler_rep1, (expected_rep1, 'balanced.avg'))
    getmatrix_rep2 = cooltools.saddle.make_cis_obsexp_fetcher(cooler_rep2, (expected_rep2, 'balanced.avg'))
    # Calculate confidence per chromosome
    confidence = {chrm[0]: calculate_confidence(getmatrix_rep1(chrm, chrm), getmatrix_rep2(chrm, chrm), k_parameter) for chrm in regions}
    # Downsampling
    downsample(output_file_name + '_rep1',
               balanced_mtx1,
               bin_size, factor,
               regions,
               confidence)
    downsample(output_file_name + '_rep2',
               balanced_mtx2,
               bin_size, factor,
               regions,
               confidence)
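

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the original module).
# The file paths, TAD table and parameter values below are assumptions.
if __name__ == "__main__":
    import pandas as pd

    replicates = ["rep1.cool", "rep2.cool"]            # assumed paths to two cooler files
    tads = pd.read_csv("tads.tsv", sep="\t")           # assumed TAD table with columns Chr, Start, End

    k = choose_k(replicates, max_k=5, window_size=5)   # pick k by maximizing replicate correlation
    avg = average_tad(replicates, k, tads, area_size=10)
    downsampling(replicates, k, factor=2, output_file_name="downsampled")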