# bioinformatics-python/task.py at main · wavetearz/bioinformatics-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re
from itertools import islice

# Width, in positions, of the sliding window that is scored and reported below.
region_length = 20

with open("example.sub") as f:
    c = f.read().strip().split("\n")

# The sequence is read from the second line of the file, keeping letters only.
strand = "".join(filter(str.isalpha, c[1]))

# Reduce every line to its structure characters "(", ")" and ".".
# Built as a new list instead of a hand-maintained index counter, so the list
# is not mutated while it is being iterated and no stray `i` is left behind.
c = [re.sub(r'[^\(\)\.]', "", line) for line in c]
# Drop the first two lines and the last line so that only structure rows remain.
# NOTE(review): presumably these are a header line, the sequence line and a
# trailing non-structure line -- confirm against the .sub file format.
del c[:2], c[-1]


# NOTE(review): the "- 1" presumably discards one extra "." that survives the
# filter above from a numeric field on each line (e.g. a decimal point) --
# confirm against the .sub file format.
row_length = len(c[0]) - 1 # should be the same for every row
column_scores = []

def mean(t):
    """Return the sum of ``t`` divided by the module-level ``region_length``.

    The divisor is the fixed window size, not ``len(t)``, so for a window that
    holds fewer than ``region_length`` values the result is not the arithmetic
    mean of those values.
    """
    total = sum(t)
    return total / region_length


def get_melting_temperature(seq):
    """Estimate the melting temperature of a sequence from its base counts.

    Only the upper-case characters ``A``, ``U``, ``G`` and ``C`` are counted;
    every other character contributes nothing to either formula.

    * ``len(seq) > 14``: ``64.9 + 41 * (G + C - 16.4) / (A + U + G + C)``
    * otherwise: ``2 * (A + U) + 4 * (G + C)``

    NOTE(review): the commonly cited form of these rules switches to the first
    formula for sequences longer than 13 bases; the original ``> 14`` cut-off
    is kept here -- confirm which one is intended.

    Args:
        seq: Sequence string.

    Returns:
        The estimate rounded to two decimals. A sequence that contains none of
        the four recognised bases (including the empty string) yields ``0``.
    """
    n_au = seq.count("A") + seq.count("U")
    n_gc = seq.count("G") + seq.count("C")
    n_bases = n_au + n_gc
    # A long sequence made only of unrecognised characters would divide by
    # zero here, so it falls through to the additive rule (which gives 0).
    if len(seq) > 14 and n_bases:
        return round(64.9 + 41 * (n_gc - 16.4) / n_bases, 2)
    return round(n_au * 2 + n_gc * 4, 2)

def get_complementary_dna(seq):
    """Return the base-by-base complement of ``seq`` (A<->U, G<->C).

    The output keeps the input order (it is not reversed). Any character other
    than ``A``, ``U``, ``G`` or ``C`` raises ``KeyError``.
    """
    pairing = {"A": "U", "U": "A", "G": "C", "C": "G"}
    paired = []
    for base in seq:
        paired.append(pairing[base])
    return "".join(paired)


def normalize_temp(temp, min, max):
    """Min-max scale ``temp`` relative to the range ``[min, max]``.

    The parameter names shadow the built-ins ``min``/``max`` inside this
    function; they are kept so existing keyword callers continue to work.

    Args:
        temp: Value to scale.
        min: Lower bound of the range (maps to 0.0).
        max: Upper bound of the range (maps to 1.0).

    Returns:
        ``(temp - min) / (max - min)``, or ``0.0`` when ``max == min`` (a
        degenerate range that would otherwise raise ``ZeroDivisionError``).
    """
    span = max - min
    if span == 0:
        return 0.0
    return (temp - min) / span

# Per-column score: the fraction of structure rows that have "." at that column.
column_scores = [
    sum(row[col] == "." for row in c) / len(c)
    for col in range(row_length)
]

# Window score keyed by 0-based start column. Each window covers up to
# region_length columns; mean() divides by region_length even for the shorter
# windows near the end, so a truncated window never scores above the last
# full-length one.
average_values = {
    start: mean(column_scores[start:start + region_length])
    for start in range(row_length)
}

# find out the index of highest average value (the earliest start wins a tie)
index = max(average_values, key=average_values.get)

# find out the highest average value
highest_average = average_values[index]

# separate the strand
final_strand = strand[index:index + region_length]

print(f"Best region: {final_strand}\tstarting from position: {index + 1}")
print(f"Melting temperature: {get_melting_temperature(final_strand)}°C")
# get_complementary_dna() already returns a str, so no join is needed; keeping
# double quotes out of the replacement field also lets this line parse on
# Python versions before 3.12.
print(f"Complementary sequence: {get_complementary_dna(final_strand)}")


print("\n\n\ntask 2: taking the melting temperature into account and calculating the score (might not work properly)")
# key - window start position in the strand; value - melting temperature
# NOTE(review): windows that start within the last region_length - 1 positions
# are shorter than region_length, and their temperatures still take part in the
# min/max used for normalisation below -- confirm whether they should be
# excluded.
melting_temperatures = {
    start: get_melting_temperature(strand[start:start + region_length])
    for start in range(len(strand))
}

# min / max temperatures from the dictionary in order to further normalize the temps
min_temp = min(melting_temperatures.values())
max_temp = max(melting_temperatures.values())

# Normalise every temperature into [0, 1]. When all temperatures are equal the
# range is zero, so every window gets 0.0 instead of dividing by zero.
temp_span = max_temp - min_temp
melting_temperatures = {
    start: normalize_temp(temp, min_temp, max_temp) if temp_span else 0.0
    for start, temp in melting_temperatures.items()
}

# Combined score per start position: 30 % window score, 70 % normalised
# temperature. The temperature is looked up by key directly rather than by
# rebuilding a list of all values on every iteration.
score = {
    start: round(0.3 * average_values[start] + 0.7 * melting_temperatures[start], 7)
    for start in range(row_length)
}

sorted_score = dict(sorted(score.items(), key=lambda item: item[1], reverse=True)) # sort dictionary by value (score), highest first
print("key - strand starting position; value - score")
print(list(islice(sorted_score.items(), 5))) # top 5 scores