-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathencoding_4bit.py
More file actions
105 lines (87 loc) · 3.27 KB
/
encoding_4bit.py
File metadata and controls
105 lines (87 loc) · 3.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python3
from dataclasses import dataclass
from code_mapping import DECODE_MAPPING, ENCODE_MAPPING, ENCODING_TO_BASES, Encoding
from generic_encoding import (
DecodingError,
EncodedQuality,
EncodedSequence,
EncodingError,
bits_to_bytes,
bytes_to_bits,
)
# Encoding variant handled by this module; used below as the key into
# ENCODING_TO_BASES, ENCODE_MAPPING and DECODE_MAPPING, and reported in errors.
ENCODING = Encoding.BIT4_FULL_IUPAC
# Two-bit tag that both header variants start with.
TAG_BIT4 = "11"
# Header used for odd-length sequences: the tag followed by 2 padding bits.
ODD_TAG = TAG_BIT4 + "00"
# The tag followed by "11"; this is what tells an even-length header apart
# from ODD_TAG when decoding.
EVEN_TAG = TAG_BIT4 + "11"
# Full 8-bit header used for even-length sequences: EVEN_TAG plus 4 zero bits.
EVEN_TAG_AND_PADDING = EVEN_TAG + "0000"
##
## Bit layout of each 4-bit symbol, read from LEFT to RIGHT
## First bit (leftmost): A included in base (included=1; not-included=0)
## A_INCLUDED=ANRWMDHV / A_NOT_INCLUDED=CGT-.YSKB
##
## Second bit: C included in base (included=1; not-included=0)
## C_INCLUDED=CNYSMBHV / C_NOT_INCLUDED=AGT-.RWKD
##
## Third bit: G included in base (included=1; not-included=0)
## G_INCLUDED=GNRSKBDV / G_NOT_INCLUDED=ACT-.YWMH
##
## Fourth bit: T included in base (included=1; not-included=0)
## T_INCLUDED=TNYWKBDH / T_NOT_INCLUDED=ACG-.RSMV
##
def encode_4bit_sequence(sequence: str) -> bytes:
    """
    Pack a sequence string into bytes using the 4-bit encoding.

    Layout (BIT4):
    [2b TAG=11][2-bit or 6-bit padding][4-bit symbols...]
    The padding is 2 bits for an odd-length sequence and 6 bits for an
    even-length one.

    The input is upper-cased and has line-feed and carriage-return
    characters removed before it is validated and encoded.

    Raises EncodingError if, after that clean-up, the sequence contains any
    symbol that is not in ENCODING_TO_BASES[ENCODING].
    """
    cleaned = sequence.upper().replace("\n", "").replace("\r", "")

    # Validate the whole alphabet up front so the error lists every bad symbol.
    unsupported = set(cleaned).difference(ENCODING_TO_BASES[ENCODING])
    if unsupported:
        raise EncodingError(
            f"Unsupported symbols in sequence ({sorted(unsupported)})",
            encoding=ENCODING.value,
        )

    symbol_to_bits = ENCODE_MAPPING[ENCODING]
    payload = "".join([symbol_to_bits[symbol] for symbol in cleaned])

    # The header variant is chosen by the parity of the cleaned length.
    if len(cleaned) % 2:
        prefix = ODD_TAG
    else:
        prefix = EVEN_TAG_AND_PADDING

    return bits_to_bytes(prefix + payload)
def decode_4bit_sequence(encoded_bytes: bytes) -> str:
    """
    Decode bytes written in the 4-bit encoding back into a sequence string.

    The leading bits must equal either ODD_TAG (4-bit header) or
    EVEN_TAG_AND_PADDING (8-bit header). Everything after the header is read
    as consecutive 4-bit groups, each looked up in DECODE_MAPPING[ENCODING].

    Raises DecodingError if the header matches neither variant, or if the
    number of bits after the header is not a multiple of 4.
    """
    bitstring = bytes_to_bits(encoded_bytes)

    # Try the odd-length header first, then the even-length one; the matched
    # header's own length (4 or 8 bits) is how much to skip.
    header_len = None
    for candidate in (ODD_TAG, EVEN_TAG_AND_PADDING):
        if bitstring[: len(candidate)] == candidate:
            header_len = len(candidate)
            break
    if header_len is None:
        raise DecodingError(
            (
                f"Wrong tag in header (found {bitstring[:4]} or {bitstring[:8]},"
                f" expected {ODD_TAG} or {EVEN_TAG_AND_PADDING})"
            ),
            encoding=ENCODING.value,
        )

    bits_to_symbol = DECODE_MAPPING[ENCODING]
    payload = bitstring[header_len:]
    remainder = len(payload) % 4
    if remainder > 0:
        raise DecodingError(
            (
                "Bitstring length after header is not divisible by 4"
                f" (found length {len(payload)} % 4 = {remainder})."
            ),
            encoding=ENCODING.value,
        )

    # Walk the payload one 4-bit group at a time.
    symbols = [
        bits_to_symbol[payload[start : start + 4]]
        for start in range(0, len(payload), 4)
    ]
    return "".join(symbols)
@dataclass
class Encoded4bitSequence(EncodedSequence):
    """
    A sequence held in the 4-bit encoding, with optional quality scores and header.

    Encoding and decoding are delegated to the module-level functions
    encode_4bit_sequence and decode_4bit_sequence.
    """

    # The sequence, packed as bytes.
    encoded_sequence: bytes
    # Encoded quality scores; None when there are none.
    encoded_quality: EncodedQuality | None = None
    # Header information; None when there is none.
    header: str | None = None

    @staticmethod
    def decode_sequence(encoded_sequence: bytes) -> str:
        """Return the sequence string decoded from 4-bit encoded bytes."""
        return decode_4bit_sequence(encoded_sequence)

    @staticmethod
    def encode_sequence(sequence: str) -> bytes:
        """Return the 4-bit encoded bytes for a sequence string."""
        return encode_4bit_sequence(sequence)