Skip to content

Commit cb5324c

Browse files
committed
Add move-to-front transform
1 parent 456d644 commit cb5324c

1 file changed

Lines changed: 99 additions & 0 deletions

File tree

data_compression/move_to_front.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
"""
Move-to-front transform.

The move-to-front transform encodes each symbol as its current index in an
ordered alphabet, then moves that symbol to the front of the alphabet.
It is commonly used after the Burrows-Wheeler transform in lossless
compression pipelines.

Reference: https://en.wikipedia.org/wiki/Move-to-front_transform
"""
11+
12+
13+
def _validated_alphabet(alphabet: str) -> list[str]:
14+
"""
15+
Return a mutable alphabet list after validating uniqueness.
16+
17+
>>> _validated_alphabet("abc")
18+
['a', 'b', 'c']
19+
>>> _validated_alphabet("aba")
20+
Traceback (most recent call last):
21+
...
22+
ValueError: alphabet must contain unique characters
23+
"""
24+
if not isinstance(alphabet, str):
25+
raise TypeError("alphabet must be a string")
26+
if len(set(alphabet)) != len(alphabet):
27+
raise ValueError("alphabet must contain unique characters")
28+
return list(alphabet)
29+
30+
31+
def move_to_front_encode(text: str, alphabet: str) -> list[int]:
    """
    Encode text using the move-to-front transform.

    Each character of ``text`` is replaced by its current position in a
    working copy of ``alphabet``; that character is then moved to position 0
    of the working copy before the next character is processed.

    Raises TypeError if ``text`` or ``alphabet`` is not a string, and
    ValueError if ``alphabet`` has repeated characters or ``text`` contains a
    character that is not in ``alphabet``.

    >>> move_to_front_encode("banana", "abcdefghijklmnopqrstuvwxyz")
    [1, 1, 13, 1, 1, 1]
    >>> move_to_front_encode("banana", "abn")
    [1, 1, 2, 1, 1, 1]
    >>> move_to_front_encode("", "abc")
    []
    >>> move_to_front_encode("bad", "abc")
    Traceback (most recent call last):
        ...
    ValueError: character 'd' is not in the alphabet
    >>> move_to_front_encode(123, "abc")
    Traceback (most recent call last):
        ...
    TypeError: text must be a string
    """
    if not isinstance(text, str):
        raise TypeError("text must be a string")

    symbols = _validated_alphabet(alphabet)
    encoded_text: list[int] = []

    for char in text:
        try:
            char_index = symbols.index(char)
        except ValueError:
            # Replace list.index's generic message with one naming the
            # offending character; "from None" hides the internal traceback.
            message = f"character {char!r} is not in the alphabet"
            raise ValueError(message) from None
        encoded_text.append(char_index)
        # Move the symbol just emitted to the front of the working alphabet.
        symbols.insert(0, symbols.pop(char_index))

    return encoded_text
62+
63+
64+
def move_to_front_decode(encoded_text: list[int], alphabet: str) -> str:
    """
    Decode a move-to-front encoded list of indexes.

    Each index selects a symbol from a working copy of ``alphabet``; the
    selected symbol is appended to the output and then moved to position 0 of
    the working copy, mirroring the steps performed by
    ``move_to_front_encode``.

    Raises TypeError if ``alphabet`` is not a string or an index is not an
    integer, and ValueError if ``alphabet`` has repeated characters or an
    index falls outside ``0 <= index < len(alphabet)``.

    >>> move_to_front_decode([1, 1, 13, 1, 1, 1], "abcdefghijklmnopqrstuvwxyz")
    'banana'
    >>> move_to_front_decode([1, 1, 2, 1, 1, 1], "abn")
    'banana'
    >>> move_to_front_decode([], "abc")
    ''
    >>> move_to_front_decode([3], "abc")
    Traceback (most recent call last):
        ...
    ValueError: index 3 is not valid for alphabet size 3
    >>> move_to_front_decode([-1], "abc")
    Traceback (most recent call last):
        ...
    ValueError: index -1 is not valid for alphabet size 3
    >>> move_to_front_decode(["1"], "abc")
    Traceback (most recent call last):
        ...
    TypeError: index '1' must be an integer
    >>> move_to_front_decode([1.0], "abc")
    Traceback (most recent call last):
        ...
    TypeError: index 1.0 must be an integer
    """
    import operator

    symbols = _validated_alphabet(alphabet)
    decoded_text: list[str] = []

    for index in encoded_text:
        # operator.index accepts anything usable as a list index and rejects
        # str/float, so bad input gets one clear error instead of an opaque
        # comparison or list-indexing failure.
        try:
            position = operator.index(index)
        except TypeError:
            message = f"index {index!r} must be an integer"
            raise TypeError(message) from None
        # Negative values are rejected explicitly; otherwise Python's
        # from-the-end indexing would silently decode them.
        if not 0 <= position < len(symbols):
            message = f"index {index} is not valid for alphabet size {len(symbols)}"
            raise ValueError(message)
        decoded_text.append(symbols[position])
        # Apply the same reordering the encoder performed for this symbol.
        symbols.insert(0, symbols.pop(position))

    return "".join(decoded_text)
94+
95+
96+
if __name__ == "__main__":
    # When executed as a script, run the doctest examples embedded in the
    # docstrings of this module.
    import doctest

    doctest.testmod()

0 commit comments

Comments
 (0)