phasing-genomes/createREFgenomesForPhasing.py at master · evodify/phasing-genomes · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python2
'''
This script makes a references of two parental genomes by summarizing alleles.

inputfile:
#CHROM POS REF ALT ind1 ind2 ind3 ind4
scaffold_1  1 A T ./. ./. A/A A/A
scaffold_1  12 G A,* A/A A/A G/G G/G
scaffold_1  54 C T,* */* */* C/T C/C
scaffold_1  81 T TA  TA/TA TA/TA T/T T/T

outputfile:
#CHROM  POS Species1  Species2
scaffold_1  12 A G
scaffold_1  54 * C,T
scaffold_1  81 TA T

command:
python createREFgenomesForPhasing.py -i inputfile -o outputfile -s1 ind1,ind2 -s2 ind3,ind4 -m 0.1

contact Dmytro Kryvokhyzha dmytro.kryvokhyzha@evobio.eu
'''

import argparse, re, collections, sys

############################ options ##############################

class CommandLineParser(argparse.ArgumentParser):
   def error(self, message):
      sys.stderr.write('error: %s\n' % message)
      self.print_help()
      sys.exit(2)

parser = CommandLineParser()
parser.add_argument('-i', '--input', help = 'name of the input file', type=str, required=True)
parser.add_argument('-o', '--output', help = 'name of the output file', type=str, required=True)
parser.add_argument('-s1', '--samples1', help = 'column names of the species A (comma delimited)', type=str, required=True)
parser.add_argument('-s2', '--samples2', help = 'column names of the species B (comma delimited)', type=str, required=True)
parser.add_argument('-m', '--missingness', help = 'allowed fraction of missing data per site per species', type=float, required=True)
args = parser.parse_args()

############################ functions ###########################

def index_sample(header, samplList):
  ''' index position of samples in file '''
  ind_names = [str(j) for j in re.findall(r'[^,\s]+', samplList)]
  ind_index = []
  for i in ind_names:
    indnumber = header.index(i)
    ind_index.append(indnumber)
  return ind_index

def extract_genotypes(genotypes, ind_index):
  ''' extract genotypes using samples index '''
  alleles = []
  for g in ind_index:
    alleles.append(genotypes[g])
  # remove missing data:
  allelesNoNs = [x for x in alleles if not (x == './.')]
  # split alleles:
  allelesNoNsSplit = [i for sublist in [i.split("/") for i in allelesNoNs] for i in sublist]
  return allelesNoNsSplit, len(alleles), len(allelesNoNs)

############################ script ##############################

miss = args.missingness
counter = 0
counterProcessed = 0
counterMissing = 0

print('Opening the file...')
with open(args.input) as datafile:
  header_line = datafile.readline()
  header_words = header_line.split()

  # index samples of the two species
  ind_sp1 = index_sample(header_words, args.samples1)
  ind_sp2 = index_sample(header_words, args.samples2)

  print('Creating the output file...')
  sp12_output = open(args.output, 'w')
  sp12_output.write("#CHROM\tPOS\tSpecies1\tSpecies2\n")

  print('Creating reference genomes...')
  for line in datafile:
    words = line.split()
    chr_pos = words[0:2]
    rowName = '\t'.join(str(e) for e in chr_pos) # make chr pos row names

    # extract genotypes using samples index
    sp1_alleles = extract_genotypes(words, ind_sp1)
    sp2_alleles = extract_genotypes(words, ind_sp2)
    sp1_alleles_noNSplit = sp1_alleles[0]
    sp2_alleles_noNSplit = sp2_alleles[0]
    len_sp1_alleles = float(sp1_alleles[1])
    len_sp2_alleles = float(sp2_alleles[1])
    len_sp1_alleles_noN = float(sp1_alleles[2])
    len_sp2_alleles_noN = float(sp2_alleles[2])

    # if missing data filter is passed, output the genotypes
    if len_sp1_alleles_noN >= len_sp1_alleles*miss and len_sp2_alleles >= len_sp2_alleles*miss:
      counterProcessed += 1
      sp1_allelesSet = set(sp1_alleles_noNSplit)
      sp2_allelesSet = set(sp2_alleles_noNSplit)
      sp1AltPrint = ','.join(str(al) for al in sp1_allelesSet)
      sp2AltPrint = ','.join(str(al) for al in sp2_allelesSet)
      sp12_output.write("%s\t%s\t%s\n" % (rowName, sp1AltPrint, sp2AltPrint))
    else:
      counterMissing += 1

    # track the progress:
    counter += 1
    if counter % 1000000 == 0:
      print str(counter), "lines processed"

datafile.close()
sp12_output.close()
print('Done!\n')

# print statistics on the screen
print str(counterProcessed), "sites processed"
print str(counterMissing), "sites with too many missing genotypes in both species\n"