-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcreateREFgenomesForPhasing.py
More file actions
122 lines (100 loc) · 4.2 KB
/
createREFgenomesForPhasing.py
File metadata and controls
122 lines (100 loc) · 4.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python2
'''
This script makes a references of two parental genomes by summarizing alleles.
inputfile:
#CHROM POS REF ALT ind1 ind2 ind3 ind4
scaffold_1 1 A T ./. ./. A/A A/A
scaffold_1 12 G A,* A/A A/A G/G G/G
scaffold_1 54 C T,* */* */* C/T C/C
scaffold_1 81 T TA TA/TA TA/TA T/T T/T
outputfile:
#CHROM POS Species1 Species2
scaffold_1 12 A G
scaffold_1 54 * C,T
scaffold_1 81 TA T
command:
python createREFgenomesForPhasing.py -i inputfile -o outputfile -s1 ind1,ind2 -s2 ind3,ind4 -m 0.1
contact Dmytro Kryvokhyzha dmytro.kryvokhyzha@evobio.eu
'''
import argparse, re, collections, sys
############################ options ##############################
class CommandLineParser(argparse.ArgumentParser):
def error(self, message):
sys.stderr.write('error: %s\n' % message)
self.print_help()
sys.exit(2)
parser = CommandLineParser()
parser.add_argument('-i', '--input', help = 'name of the input file', type=str, required=True)
parser.add_argument('-o', '--output', help = 'name of the output file', type=str, required=True)
parser.add_argument('-s1', '--samples1', help = 'column names of the species A (comma delimited)', type=str, required=True)
parser.add_argument('-s2', '--samples2', help = 'column names of the species B (comma delimited)', type=str, required=True)
parser.add_argument('-m', '--missingness', help = 'allowed fraction of missing data per site per species', type=float, required=True)
args = parser.parse_args()
############################ functions ###########################
def index_sample(header, samplList):
''' index position of samples in file '''
ind_names = [str(j) for j in re.findall(r'[^,\s]+', samplList)]
ind_index = []
for i in ind_names:
indnumber = header.index(i)
ind_index.append(indnumber)
return ind_index
def extract_genotypes(genotypes, ind_index):
''' extract genotypes using samples index '''
alleles = []
for g in ind_index:
alleles.append(genotypes[g])
# remove missing data:
allelesNoNs = [x for x in alleles if not (x == './.')]
# split alleles:
allelesNoNsSplit = [i for sublist in [i.split("/") for i in allelesNoNs] for i in sublist]
return allelesNoNsSplit, len(alleles), len(allelesNoNs)
############################ script ##############################
miss = args.missingness
counter = 0
counterProcessed = 0
counterMissing = 0
print('Opening the file...')
with open(args.input) as datafile:
header_line = datafile.readline()
header_words = header_line.split()
# index samples of the two species
ind_sp1 = index_sample(header_words, args.samples1)
ind_sp2 = index_sample(header_words, args.samples2)
print('Creating the output file...')
sp12_output = open(args.output, 'w')
sp12_output.write("#CHROM\tPOS\tSpecies1\tSpecies2\n")
print('Creating reference genomes...')
for line in datafile:
words = line.split()
chr_pos = words[0:2]
rowName = '\t'.join(str(e) for e in chr_pos) # make chr pos row names
# extract genotypes using samples index
sp1_alleles = extract_genotypes(words, ind_sp1)
sp2_alleles = extract_genotypes(words, ind_sp2)
sp1_alleles_noNSplit = sp1_alleles[0]
sp2_alleles_noNSplit = sp2_alleles[0]
len_sp1_alleles = float(sp1_alleles[1])
len_sp2_alleles = float(sp2_alleles[1])
len_sp1_alleles_noN = float(sp1_alleles[2])
len_sp2_alleles_noN = float(sp2_alleles[2])
# if missing data filter is passed, output the genotypes
if len_sp1_alleles_noN >= len_sp1_alleles*miss and len_sp2_alleles >= len_sp2_alleles*miss:
counterProcessed += 1
sp1_allelesSet = set(sp1_alleles_noNSplit)
sp2_allelesSet = set(sp2_alleles_noNSplit)
sp1AltPrint = ','.join(str(al) for al in sp1_allelesSet)
sp2AltPrint = ','.join(str(al) for al in sp2_allelesSet)
sp12_output.write("%s\t%s\t%s\n" % (rowName, sp1AltPrint, sp2AltPrint))
else:
counterMissing += 1
# track the progress:
counter += 1
if counter % 1000000 == 0:
print str(counter), "lines processed"
datafile.close()
sp12_output.close()
print('Done!\n')
# print statistics on the screen
print str(counterProcessed), "sites processed"
print str(counterMissing), "sites with too many missing genotypes in both species\n"