String-Matching-Algorithm-Analysis/benchmark.py at main · BabolokiJ/String-Matching-Algorithm-Analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
import time
import random
import string
import matplotlib
matplotlib.use("Agg")  # non-interactive backend, no display needed
import matplotlib.pyplot as plt
import os

# benchmark comparing naive vs KMP string search
# written to understand why KMP is actually faster in practice
#
# text is randomly generated using random.seed(42) so the same strings
# are produced every run - results are string-reproducible, though timing
# will vary slightly depending on your machine
#
# run this script and it will print results to the terminal and also
# write a results.md file that the README references.

# --- naive approach ---
# just slide the pattern along the text one position at a time
# if anything doesn't match, give up and move forward
def naive_search(text, pattern):
    matches = []
    n = len(text)
    m = len(pattern)

    for i in range(n - m + 1):
        found = True
        for j in range(m):
            if text[i + j] != pattern[j]:
                found = False
                break
        if found:
            matches.append(i)

    return matches

# --- KMP ---
# before searching, build a table that tells us how far to jump back
# when we hit a mismatch, instead of starting over completely
#
# the table (LPS) stores the length of the longest prefix of the
# pattern that's also a suffix at each position

def build_lps(pattern):
    m = len(pattern)
    lps = [0] * m
    prev = 0  # length of the previous longest prefix-suffix
    i = 1

    while i < m:
        if pattern[i] == pattern[prev]:
            prev += 1
            lps[i] = prev
            i += 1
        else:
            if prev != 0:
                # try a shorter prefix before giving up
                prev = lps[prev - 1]
            else:
                lps[i] = 0
                i += 1

    return lps

def kmp_search(text, pattern):
    matches = []
    n = len(text)
    m = len(pattern)

    if m == 0:
        return matches

    lps = build_lps(pattern)
    i = 0  # position in text
    j = 0  # position in pattern

    while i < n:
        if text[i] == pattern[j]:
            i += 1
            j += 1

        if j == m:
            matches.append(i - j)
            j = lps[j - 1]  # look for next match
        elif i < n and text[i] != pattern[j]:
            if j != 0:
                j = lps[j - 1]  # jump back using the table
            else:
                i += 1  # no fallback, just move on

    return matches

# --- timing helper ---
# run it a few times and take the best, timing is noisy otherwise
def time_it(fn, text, pattern, runs=3):
    best = float('inf')
    result = None
    for _ in range(runs):
        t0 = time.perf_counter()
        result = fn(text, pattern)
        elapsed = (time.perf_counter() - t0) * 1e6  # microseconds
        if elapsed < best:
            best = elapsed
    return best, result

# returns a dict with the timing info so we can write it to results.md later
def run_test(label, text, pattern):
    naive_t, naive_out = time_it(naive_search, text, pattern)
    kmp_t, kmp_out = time_it(kmp_search, text, pattern)

    # sanity check - both should find the same matches
    if naive_out != kmp_out:
        print(f"  !! MISMATCH on '{label}' - something is broken")
        return None

    speedup = naive_t / kmp_t if kmp_t > 0 else 0
    print(f"  {label}")
    print(f"    naive: {naive_t:>9.1f} us")
    print(f"    kmp:   {kmp_t:>9.1f} us   ({speedup:.1f}x speedup, {len(naive_out)} matches)")
    print()

    return {
        "label": label,
        "naive_us": naive_t,
        "kmp_us": kmp_t,
        "speedup": speedup,
        "matches": len(naive_out),
    }

# --- correctness sanity check before benchmarking ---
def check_correctness():
    print("checking both algos agree on some known cases...")
    cases = [
        ("ABABCABABAB", "ABAB", [0, 5, 7]),
        ("AAAAAB", "AAAB", [2]),
        ("hello world", "world", [6]),
        ("AAAA", "AA", [0, 1, 2]),
        ("abcdef", "xyz", []),
        ("", "abc", []),
        ("ABCABCABC", "ABC", [0, 3, 6]),
    ]

    ok = True
    for text, pat, expected in cases:
        n = naive_search(text, pat)
        k = kmp_search(text, pat)
        if n == k == expected:
            print(f"  ok   pattern='{pat}' in '{text[:20]}'")
        else:
            print(f"  FAIL pattern='{pat}' in '{text[:20]}' -> naive={n} kmp={k} expected={expected}")
            ok = False

    print()
    return ok

def generate_graphs(sections):
    os.makedirs("graphs", exist_ok=True)
    graph_paths = []

    naive_color = "#e05c5c"
    kmp_color = "#5c8be0"

    for section_name, rows in sections:
        rows = [r for r in rows if r]
        if not rows:
            continue

        labels = [r["label"].replace("  ", "\n") for r in rows]
        naive_times = [r["naive_us"] / 1000 for r in rows]  # convert to ms
        kmp_times = [r["kmp_us"] / 1000 for r in rows]

        x = range(len(labels))
        width = 0.35

        fig, ax = plt.subplots(figsize=(8, 4.5))
        bars_n = ax.bar([i - width/2 for i in x], naive_times, width, label="naive", color=naive_color)
        bars_k = ax.bar([i + width/2 for i in x], kmp_times, width, label="kmp", color=kmp_color)

        # label each bar with the speedup
        for i, r in enumerate(rows):
            ax.text(i, max(naive_times[i], kmp_times[i]) * 1.05,
                    f"{r['speedup']:.1f}x", ha="center", va="bottom", fontsize=8, color="#444")

        ax.set_xticks(list(x))
        ax.set_xticklabels(labels, fontsize=8)
        ax.set_ylabel("time (ms)")
        ax.set_title(section_name, fontsize=10, pad=10)
        ax.legend(fontsize=9)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_ylim(0, max(naive_times) * 1.25)

        # sanitise section name for filename
        fname = section_name.split(":")[0].strip().replace(" ", "_") + ".png"
        fpath = f"graphs/{fname}"
        fig.tight_layout()
        fig.savefig(fpath, dpi=120)
        plt.close(fig)
        graph_paths.append((section_name, fpath))
        print(f"  saved graph: {fpath}")

    # also make a speedup summary chart across all scenarios
    all_rows = [r for _, rows in sections for r in rows if r]
    if all_rows:
        labels = [r["label"].replace("  ", "\n") for r in all_rows]
        speedups = [r["speedup"] for r in all_rows]
        colors = [kmp_color if s < 5 else "#e07d2a" if s < 50 else "#c0392b" for s in speedups]

        fig, ax = plt.subplots(figsize=(10, 4.5))
        ax.bar(range(len(labels)), speedups, color=colors)
        ax.axhline(1, color="#aaa", linewidth=0.8, linestyle="--")
        ax.set_xticks(range(len(labels)))
        ax.set_xticklabels(labels, fontsize=7)
        ax.set_ylabel("speedup (kmp vs naive)")
        ax.set_title("KMP speedup across all tests", fontsize=10, pad=10)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)

        fpath = "graphs/speedup_summary.png"
        fig.tight_layout()
        fig.savefig(fpath, dpi=120)
        plt.close(fig)
        graph_paths.append(("speedup summary", fpath))
        print(f"  saved graph: {fpath}")

    return graph_paths

def write_results(sections, all_speedups, graph_paths=None):
    graph_map = {}
    if graph_paths:
        for name, path in graph_paths:
            graph_map[name] = path

    lines = []
    lines.append("# benchmark results\n")
    lines.append("> generated by `benchmark.py` -- re-run the script to refresh these numbers.\n")
    lines.append("> text strings are reproducible (seeded RNG), but timings vary by machine.\n")

    # speedup summary graph at the top if available
    if "speedup summary" in graph_map:
        lines.append(f"\n![speedup summary]({graph_map['speedup summary']})\n")

    for section_name, rows in sections:
        lines.append(f"\n## {section_name}\n")

        if section_name in graph_map:
            lines.append(f"![{section_name}]({graph_map[section_name]})\n")

        lines.append("| test | naive (us) | kmp (us) | speedup | matches |")
        lines.append("|------|-----------|---------|---------|---------|")
        for r in rows:
            if r:
                lines.append(
                    f"| {r['label']} | {r['naive_us']:.1f} | {r['kmp_us']:.1f} "
                    f"| {r['speedup']:.1f}x | {r['matches']} |"
                )

    lines.append("\n## summary\n")
    avg = sum(all_speedups) / len(all_speedups)
    lines.append(f"- tests run: {len(all_speedups)}")
    lines.append(f"- average speedup: {avg:.1f}x")
    lines.append(f"- best speedup: {max(all_speedups):.1f}x")
    lines.append(f"- worst speedup: {min(all_speedups):.2f}x")

    with open("results.md", "w", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")

    print("results written to results.md")

def main():
    random.seed(42)

    if not check_correctness():
        print("correctness check failed, stopping")
        return

    all_speedups = []
    sections = []

    # scenario 1 - just random text, typical use case
    print("--- scenario 1: random text, random pattern ---\n")
    rows = []
    for tlen, plen in [(1000, 10), (10000, 100), (100000, 1000)]:
        text = ''.join(random.choices(string.ascii_lowercase, k=tlen))
        pat = ''.join(random.choices(string.ascii_lowercase, k=plen))
        r = run_test(f"text={tlen:,}  pattern={plen}", text, pat)
        rows.append(r)
        if r: all_speedups.append(r["speedup"])
    sections.append(("scenario 1: random text, random pattern", rows))

    # scenario 2 - worst case for naive
    # text is all A's, pattern is (n-1) A's then a B
    print("--- scenario 2: worst case for naive (AAAA...B) ---\n")
    rows = []
    for tlen, plen in [(1000, 50), (5000, 250), (10000, 500)]:
        text = 'A' * tlen
        pat = 'A' * (plen - 1) + 'B'
        r = run_test(f"text={tlen:,}  pattern={plen}", text, pat)
        rows.append(r)
        if r: all_speedups.append(r["speedup"])
    sections.append(("scenario 2: worst case for naive (AAAA...B)", rows))

    # scenario 3 - small alphabet like DNA (ACGT)
    # fewer distinct characters = more partial matches = more backtracking for naive
    print("--- scenario 3: DNA-style sequences (ACGT only) ---\n")
    rows = []
    for tlen in [1000, 10000, 100000]:
        text = ''.join(random.choices('ACGT', k=tlen))
        pat = ''.join(random.choices('ACGT', k=20))
        r = run_test(f"text={tlen:,}  pattern=20", text, pat)
        rows.append(r)
        if r: all_speedups.append(r["speedup"])
    sections.append(("scenario 3: DNA-style sequences (ACGT only)", rows))

    # scenario 4 - see how pattern length affects things
    print("--- scenario 4: varying pattern length (text = 50k chars) ---\n")
    rows = []
    text = ''.join(random.choices(string.ascii_lowercase, k=50000))
    for plen in [5, 20, 100, 500]:
        pat = ''.join(random.choices(string.ascii_lowercase, k=plen))
        r = run_test(f"text=50,000  pattern={plen}", text, pat)
        rows.append(r)
        if r: all_speedups.append(r["speedup"])
    sections.append(("scenario 4: varying pattern length (text=50k)", rows))

    # print summary to terminal
    avg = sum(all_speedups) / len(all_speedups)
    print(f"--- done ---")
    print(f"ran {len(all_speedups)} tests")
    print(f"average speedup: {avg:.1f}x")
    print(f"best speedup:    {max(all_speedups):.1f}x")
    print(f"worst speedup:   {min(all_speedups):.2f}x")
    print()

    print("generating graphs...")
    graph_paths = generate_graphs(sections)
    print()

    write_results(sections, all_speedups, graph_paths)

if __name__ == "__main__":
    main()