From 3e4880b6d9698735b3e9450108358b67b7b77377 Mon Sep 17 00:00:00 2001
From: Daniel Nichols <dando18studios@gmail.com>
Date: Thu, 8 May 2025 18:03:53 -0400
Subject: [PATCH 1/6] add basic ci functionality for serial cpu tests

---
 .github/workflows/serial-tests.yml | 32 ++++++++++++++++++++++++++++++
 prompts/create-serial-tests.py     |  5 +++--
 test/README.md                     |  3 +++
 test/test-cpu.bash                 | 32 ++++++++++++++++++++++++++++++
 4 files changed, 70 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/serial-tests.yml
 create mode 100644 test/README.md
 create mode 100644 test/test-cpu.bash

diff --git a/.github/workflows/serial-tests.yml b/.github/workflows/serial-tests.yml
new file mode 100644
index 0000000..cc67055
--- /dev/null
+++ b/.github/workflows/serial-tests.yml
@@ -0,0 +1,32 @@
+name: Serial CPU Tests
+
+on:
+  push:
+    branches:
+      - develop
+  pull_request:
+    branches:
+      - develop
+
+jobs:
+  serial-tests:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        problem: [00_dense_la_lu_decomp, 01_dense_la_solve, 02_dense_la_gemm, 03_dense_la_axpy, 04_dense_la_gemv, 05_fft_inverse_fft, 06_fft_dft, 07_fft_fft_conjugate, 08_fft_split_fft, 09_fft_fft_out_of_place, 10_geometry_convex_hull, 11_geometry_convex_hull_perimeter, 12_geometry_smallest_triangle, 13_geometry_closest_pair_2d, 14_geometry_closest_pair_1d, 15_graph_edge_count, 16_graph_largest_component, 17_graph_highest_degree, 18_graph_count_components, 19_graph_shortest_path, 20_histogram_pixel_histogram, 21_histogram_bin_0-100, 22_histogram_count_quadrants, 23_histogram_first_letter_counts, 24_histogram_count_quartile, 25_reduce_xor, 26_reduce_product_of_inverses, 27_reduce_average, 28_reduce_smallest_odd_number, 29_reduce_sum_of_min_of_pairs, 30_scan_prefix_sum, 31_scan_scan_with_min_function, 32_scan_sum_of_prefix_sum_array, 33_scan_reverse_prefix_sum,, 34_scan_largest_contiguous_subarray_sum, 35_search_search_for_last_struct_by_key, 36_search_check_if_array_contains_value, 37_search_find_the_closest_number_to_pi,, 38_search_find_the_first_even_number, 39_search_xor_contains, 40_sort_sort_an_array_of_complex_numbers_by_magnitude, 41_sort_k-th_smallest_element, 42_sort_sorted_ranks, 43_sort_sort_an_array_of_structs_by_key,, 44_sort_sort_non-zero_elements, 45_sparse_la_sparse_solve, 46_sparse_la_spmm, 47_sparse_la_spmv, 48_sparse_la_sparse_axpy, 49_sparse_la_sparse_lu_decomp, 50_stencil_xor_kernel, 51_stencil_edge_kernel, 52_stencil_1d_jacobi_3-point_stencil, 53_stencil_2d_jacobi_5-point_stencil, 54_stencil_game_of_life, 55_transform_relu, 56_transform_negate_odds, 57_transform_inverse_offset, 58_transform_squaring, 59_transform_map_function]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.x'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install tqdm
+
+      - name: Run CPU test for ${{ matrix.problem }}
+        run: bash test/test-cpu.bash "${{ matrix.problem }}"
\ No newline at end of file
diff --git a/prompts/create-serial-tests.py b/prompts/create-serial-tests.py
index 29e0adc..fdcd6c0 100644
--- a/prompts/create-serial-tests.py
+++ b/prompts/create-serial-tests.py
@@ -24,7 +24,7 @@ def get_return_type(code: str) -> str:
     # then return the type
     lines = code.split('\n')
     for line in lines:
-        if line.strip().endswith(') {'):
+        if "NO_INLINE correct" in line and line.strip().endswith(') {'):
             return line.split()[0]
 
 def main():
@@ -45,7 +45,8 @@ def main():
             continue
 
         baseline = get_file_contents(baseline_fpath)
-        impl = get_substr_after_first_of(baseline, ') {')
+        func_start = get_substr_after_first_of(baseline, 'NO_INLINE correct')
+        impl = get_substr_after_first_of(func_start, ') {')
         return_type = get_return_type(baseline)
         prompt['outputs'] = [
             impl, 
diff --git a/test/README.md b/test/README.md
new file mode 100644
index 0000000..6dba1b9
--- /dev/null
+++ b/test/README.md
@@ -0,0 +1,3 @@
+# Tests
+
+Tests for the benchmark. Currently, only the sequential CPU capabilities are tested.
\ No newline at end of file
diff --git a/test/test-cpu.bash b/test/test-cpu.bash
new file mode 100644
index 0000000..2699d89
--- /dev/null
+++ b/test/test-cpu.bash
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Uses the baseline implementations to test the CPU capabilities of the system.
+
+# usage: bash test/test-cpu.bash <?problem>
+if [ $# -eq 0 ]; then
+    echo "No problem specified. Using default: 'all'."
+    PROBLEM_ARG=""
+else
+    PROBLEM_ARG="--problem $1"
+fi
+
+# First, use the baseline implementations to mimic LLM outputs.
+python prompts/create-serial-tests.py drivers/cpp/benchmarks prompts/generation-prompts.json serial-generations.json
+
+# make sure the model drivers are built
+cd drivers
+cd cpp
+make
+cd ..
+
+# Run the drivers using these generations
+python run-all.py \
+    ../serial-generations.json \
+    --output results.json \
+    --launch-configs launch-configs.json \
+    --problem-sizes problem-sizes.json \
+    --yes-to-all \
+    --include-models serial \
+    ${PROBLEM_ARG} \
+    --build-timeout 60 \
+    --run-timeout 120 \
+    --log info
\ No newline at end of file

From b7e88a7631fe813b7d9c67fc85b575bef6c1816e Mon Sep 17 00:00:00 2001
From: Daniel Nichols <dando18studios@gmail.com>
Date: Thu, 8 May 2025 18:06:18 -0400
Subject: [PATCH 2/6] add quotes on problem names

---
 .github/workflows/serial-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/serial-tests.yml b/.github/workflows/serial-tests.yml
index cc67055..7645e95 100644
--- a/.github/workflows/serial-tests.yml
+++ b/.github/workflows/serial-tests.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        problem: [00_dense_la_lu_decomp, 01_dense_la_solve, 02_dense_la_gemm, 03_dense_la_axpy, 04_dense_la_gemv, 05_fft_inverse_fft, 06_fft_dft, 07_fft_fft_conjugate, 08_fft_split_fft, 09_fft_fft_out_of_place, 10_geometry_convex_hull, 11_geometry_convex_hull_perimeter, 12_geometry_smallest_triangle, 13_geometry_closest_pair_2d, 14_geometry_closest_pair_1d, 15_graph_edge_count, 16_graph_largest_component, 17_graph_highest_degree, 18_graph_count_components, 19_graph_shortest_path, 20_histogram_pixel_histogram, 21_histogram_bin_0-100, 22_histogram_count_quadrants, 23_histogram_first_letter_counts, 24_histogram_count_quartile, 25_reduce_xor, 26_reduce_product_of_inverses, 27_reduce_average, 28_reduce_smallest_odd_number, 29_reduce_sum_of_min_of_pairs, 30_scan_prefix_sum, 31_scan_scan_with_min_function, 32_scan_sum_of_prefix_sum_array, 33_scan_reverse_prefix_sum,, 34_scan_largest_contiguous_subarray_sum, 35_search_search_for_last_struct_by_key, 36_search_check_if_array_contains_value, 37_search_find_the_closest_number_to_pi,, 38_search_find_the_first_even_number, 39_search_xor_contains, 40_sort_sort_an_array_of_complex_numbers_by_magnitude, 41_sort_k-th_smallest_element, 42_sort_sorted_ranks, 43_sort_sort_an_array_of_structs_by_key,, 44_sort_sort_non-zero_elements, 45_sparse_la_sparse_solve, 46_sparse_la_spmm, 47_sparse_la_spmv, 48_sparse_la_sparse_axpy, 49_sparse_la_sparse_lu_decomp, 50_stencil_xor_kernel, 51_stencil_edge_kernel, 52_stencil_1d_jacobi_3-point_stencil, 53_stencil_2d_jacobi_5-point_stencil, 54_stencil_game_of_life, 55_transform_relu, 56_transform_negate_odds, 57_transform_inverse_offset, 58_transform_squaring, 59_transform_map_function]
+        problem: ["00_dense_la_lu_decomp", "01_dense_la_solve", "02_dense_la_gemm", "03_dense_la_axpy", "04_dense_la_gemv", "05_fft_inverse_fft", "06_fft_dft", "07_fft_fft_conjugate", "08_fft_split_fft", "09_fft_fft_out_of_place", "10_geometry_convex_hull", "11_geometry_convex_hull_perimeter", "12_geometry_smallest_triangle", "13_geometry_closest_pair_2d", "14_geometry_closest_pair_1d", "15_graph_edge_count", "16_graph_largest_component", "17_graph_highest_degree", "18_graph_count_components", "19_graph_shortest_path", "20_histogram_pixel_histogram", "21_histogram_bin_0-100", "22_histogram_count_quadrants", "23_histogram_first_letter_counts", "24_histogram_count_quartile", "25_reduce_xor", "26_reduce_product_of_inverses", "27_reduce_average", "28_reduce_smallest_odd_number", "29_reduce_sum_of_min_of_pairs", "30_scan_prefix_sum", "31_scan_scan_with_min_function", "32_scan_sum_of_prefix_sum_array", "33_scan_reverse_prefix_sum,", "34_scan_largest_contiguous_subarray_sum", "35_search_search_for_last_struct_by_key", "36_search_check_if_array_contains_value", "37_search_find_the_closest_number_to_pi,", "38_search_find_the_first_even_number", "39_search_xor_contains", "40_sort_sort_an_array_of_complex_numbers_by_magnitude", "41_sort_k-th_smallest_element", "42_sort_sorted_ranks", "43_sort_sort_an_array_of_structs_by_key,", "44_sort_sort_non-zero_elements", "45_sparse_la_sparse_solve", "46_sparse_la_spmm", "47_sparse_la_spmv", "48_sparse_la_sparse_axpy", "49_sparse_la_sparse_lu_decomp", "50_stencil_xor_kernel", "51_stencil_edge_kernel", "52_stencil_1d_jacobi_3-point_stencil", "53_stencil_2d_jacobi_5-point_stencil", "54_stencil_game_of_life", "55_transform_relu", "56_transform_negate_odds", "57_transform_inverse_offset", "58_transform_squaring", "59_transform_map_function"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3

From 9d3b365d6a4ef397e2fe8d8a678b527048bceb70 Mon Sep 17 00:00:00 2001
From: Daniel Nichols <dando18studios@gmail.com>
Date: Thu, 8 May 2025 18:22:23 -0400
Subject: [PATCH 3/6] check results of driver

---
 test/test-cpu.bash            |  18 +++++-
 test/validate-test-results.py | 112 ++++++++++++++++++++++++++++++++++
 2 files changed, 127 insertions(+), 3 deletions(-)
 create mode 100644 test/validate-test-results.py

diff --git a/test/test-cpu.bash b/test/test-cpu.bash
index 2699d89..dcd8aea 100644
--- a/test/test-cpu.bash
+++ b/test/test-cpu.bash
@@ -10,7 +10,7 @@ else
 fi
 
 # First, use the baseline implementations to mimic LLM outputs.
-python prompts/create-serial-tests.py drivers/cpp/benchmarks prompts/generation-prompts.json serial-generations.json
+python3.11 prompts/create-serial-tests.py drivers/cpp/benchmarks prompts/generation-prompts.json serial-generations.json
 
 # make sure the model drivers are built
 cd drivers
@@ -19,7 +19,7 @@ make
 cd ..
 
 # Run the drivers using these generations
-python run-all.py \
+python3.11 run-all.py \
     ../serial-generations.json \
     --output results.json \
     --launch-configs launch-configs.json \
@@ -29,4 +29,16 @@ python run-all.py \
     ${PROBLEM_ARG} \
     --build-timeout 60 \
     --run-timeout 120 \
-    --log info
\ No newline at end of file
+    --log info
+
+
+# check results
+cd ..
+python3.11 test/validate-test-results.py \
+    --results drivers/results.json \
+    --problem $1 \
+    --expected-write 3 \
+    --expected-source-valid 3 \
+    --expected-build 2 \
+    --expected-run 2 \
+    --expected-correct 1
\ No newline at end of file
diff --git a/test/validate-test-results.py b/test/validate-test-results.py
new file mode 100644
index 0000000..cc191a5
--- /dev/null
+++ b/test/validate-test-results.py
@@ -0,0 +1,112 @@
+""" Checks if the expected test results are present in the output JSON file.
+    usage: python test/validate-test-results.py \
+        --results <results.json> \
+        --problem <problem_name> \
+        --expected-write <expected_write_count> \
+        --expected-source-valid <expected_source_valid_count> \
+        --expected-build <expected_build_count> \
+        --expected-run <expected_run_count> \
+        --expected-correct <expected_correct_count>
+"""
+from argparse import ArgumentParser
+import json
+from collections import Counter
+
+
+def parse_args():  # Build the command-line parser and return the parsed argparse namespace.
+    parser = ArgumentParser(description="Validate test results.")
+    parser.add_argument(
+        "--results",
+        type=str,
+        required=True,
+        help="Path to the results JSON file.",
+    )
+    parser.add_argument(
+        "--problem",
+        type=str,
+        required=True,
+        help="Name of the problem to validate.",
+    )
+    parser.add_argument(
+        "--expected-write",
+        type=int,
+        required=True,
+        help="Expected number of write operations.",
+    )
+    parser.add_argument(
+        "--expected-source-valid",
+        type=int,
+        required=True,
+        help="Expected number of source valid operations.",
+    )
+    parser.add_argument(
+        "--expected-build",
+        type=int,
+        required=True,
+        help="Expected number of build operations.",
+    )
+    parser.add_argument(
+        "--expected-run",
+        type=int,
+        required=True,
+        help="Expected number of run operations.",
+    )
+    parser.add_argument(
+        "--expected-correct",
+        type=int,
+        required=True,
+        help="Expected number of correct operations.",
+    )
+    # Every option above is required=True, so argparse exits with a usage error if one is missing.
+    return parser.parse_args()
+
+
+def validate_outputs(outputs, expected_counts):
+    """Tally passing outputs per stage; print the first mismatch and return False, else True."""
+    # stage key used in expected_counts -> boolean field read from each output record
+    stage_flags = {
+        "write": "source_write_success",
+        "source_valid": "is_source_valid",
+        "build": "did_build",
+        "run": "did_all_run",
+        "correct": "are_all_valid",
+    }
+    actual_counts = Counter(
+        stage
+        for output in outputs
+        for stage, flag in stage_flags.items()
+        if output.get(flag, False)
+    )
+    for key, expected in expected_counts.items():
+        if actual_counts[key] != expected:
+            print(f"Expected {expected} for {key}, but got {actual_counts[key]}.")
+            return False
+    return True
+
+
+def main():
+    """Check one problem's results; return 1 on mismatch, exit with an error if it is missing."""
+    args = parse_args()
+    with open(args.results, "r") as f:
+        results = json.load(f)
+
+    expected_counts = {
+        "write": args.expected_write,
+        "source_valid": args.expected_source_valid,
+        "build": args.expected_build,
+        "run": args.expected_run,
+        "correct": args.expected_correct,
+    }
+
+    # Look up by name with a default: an unknown problem exits non-zero with a clear message.
+    problem = next((r for r in results if r["name"] == args.problem), None)
+    if problem is None:
+        raise SystemExit(f"Problem {args.problem} not found in {args.results}.")
+    if not validate_outputs(problem["outputs"], expected_counts):
+        print(f"Validation failed for problem {args.problem}.")
+        return 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # propagate main()'s return (1 on failure, None -> 0) as the exit status
+    
\ No newline at end of file

From f6c4110a1eb667a9c3a3ba776333bf8588fc33e1 Mon Sep 17 00:00:00 2001
From: Daniel Nichols <dando18studios@gmail.com>
Date: Thu, 8 May 2025 18:23:43 -0400
Subject: [PATCH 4/6] fix python command name

---
 test/test-cpu.bash | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/test-cpu.bash b/test/test-cpu.bash
index dcd8aea..776cc22 100644
--- a/test/test-cpu.bash
+++ b/test/test-cpu.bash
@@ -10,7 +10,7 @@ else
 fi
 
 # First, use the baseline implementations to mimic LLM outputs.
-python3.11 prompts/create-serial-tests.py drivers/cpp/benchmarks prompts/generation-prompts.json serial-generations.json
+python prompts/create-serial-tests.py drivers/cpp/benchmarks prompts/generation-prompts.json serial-generations.json
 
 # make sure the model drivers are built
 cd drivers
@@ -19,7 +19,7 @@ make
 cd ..
 
 # Run the drivers using these generations
-python3.11 run-all.py \
+python run-all.py \
     ../serial-generations.json \
     --output results.json \
     --launch-configs launch-configs.json \
@@ -34,7 +34,7 @@ python3.11 run-all.py \
 
 # check results
 cd ..
-python3.11 test/validate-test-results.py \
+python test/validate-test-results.py \
     --results drivers/results.json \
     --problem $1 \
     --expected-write 3 \

From 4a6b02637cf5ba8151178e78df18b6bdc484cf0a Mon Sep 17 00:00:00 2001
From: Daniel Nichols <dando18studios@gmail.com>
Date: Thu, 8 May 2025 18:26:20 -0400
Subject: [PATCH 5/6] fix problem name in ci file

---
 .github/workflows/serial-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/serial-tests.yml b/.github/workflows/serial-tests.yml
index 7645e95..5505121 100644
--- a/.github/workflows/serial-tests.yml
+++ b/.github/workflows/serial-tests.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        problem: ["00_dense_la_lu_decomp", "01_dense_la_solve", "02_dense_la_gemm", "03_dense_la_axpy", "04_dense_la_gemv", "05_fft_inverse_fft", "06_fft_dft", "07_fft_fft_conjugate", "08_fft_split_fft", "09_fft_fft_out_of_place", "10_geometry_convex_hull", "11_geometry_convex_hull_perimeter", "12_geometry_smallest_triangle", "13_geometry_closest_pair_2d", "14_geometry_closest_pair_1d", "15_graph_edge_count", "16_graph_largest_component", "17_graph_highest_degree", "18_graph_count_components", "19_graph_shortest_path", "20_histogram_pixel_histogram", "21_histogram_bin_0-100", "22_histogram_count_quadrants", "23_histogram_first_letter_counts", "24_histogram_count_quartile", "25_reduce_xor", "26_reduce_product_of_inverses", "27_reduce_average", "28_reduce_smallest_odd_number", "29_reduce_sum_of_min_of_pairs", "30_scan_prefix_sum", "31_scan_scan_with_min_function", "32_scan_sum_of_prefix_sum_array", "33_scan_reverse_prefix_sum,", "34_scan_largest_contiguous_subarray_sum", "35_search_search_for_last_struct_by_key", "36_search_check_if_array_contains_value", "37_search_find_the_closest_number_to_pi,", "38_search_find_the_first_even_number", "39_search_xor_contains", "40_sort_sort_an_array_of_complex_numbers_by_magnitude", "41_sort_k-th_smallest_element", "42_sort_sorted_ranks", "43_sort_sort_an_array_of_structs_by_key,", "44_sort_sort_non-zero_elements", "45_sparse_la_sparse_solve", "46_sparse_la_spmm", "47_sparse_la_spmv", "48_sparse_la_sparse_axpy", "49_sparse_la_sparse_lu_decomp", "50_stencil_xor_kernel", "51_stencil_edge_kernel", "52_stencil_1d_jacobi_3-point_stencil", "53_stencil_2d_jacobi_5-point_stencil", "54_stencil_game_of_life", "55_transform_relu", "56_transform_negate_odds", "57_transform_inverse_offset", "58_transform_squaring", "59_transform_map_function"]
+        problem: ["00_dense_la_lu_decomp", "01_dense_la_solve", "02_dense_la_gemm", "03_dense_la_axpy", "04_dense_la_gemv", "05_fft_inverse_fft", "06_fft_dft", "07_fft_fft_conjugate", "08_fft_split_fft", "09_fft_fft_out_of_place", "10_geometry_convex_hull", "11_geometry_convex_hull_perimeter", "12_geometry_smallest_triangle", "13_geometry_closest_pair_2d", "14_geometry_closest_pair_1d", "15_graph_edge_count", "16_graph_largest_component", "17_graph_highest_degree", "18_graph_count_components", "19_graph_shortest_path", "20_histogram_pixel_histogram", "21_histogram_bin_0-100", "22_histogram_count_quadrants", "23_histogram_first_letter_counts", "24_histogram_count_quartile", "25_reduce_xor", "26_reduce_product_of_inverses", "27_reduce_average", "28_reduce_smallest_odd_number", "29_reduce_sum_of_min_of_pairs", "30_scan_prefix_sum", "31_scan_scan_with_min_function", "32_scan_sum_of_prefix_sum_array", "33_scan_reverse_prefix_sum", "34_scan_largest_contiguous_subarray_sum", "35_search_search_for_last_struct_by_key", "36_search_check_if_array_contains_value", "37_search_find_the_closest_number_to_pi", "38_search_find_the_first_even_number", "39_search_xor_contains", "40_sort_sort_an_array_of_complex_numbers_by_magnitude", "41_sort_k-th_smallest_element", "42_sort_sorted_ranks", "43_sort_sort_an_array_of_structs_by_key", "44_sort_sort_non-zero_elements", "45_sparse_la_sparse_solve", "46_sparse_la_spmm", "47_sparse_la_spmv", "48_sparse_la_sparse_axpy", "49_sparse_la_sparse_lu_decomp", "50_stencil_xor_kernel", "51_stencil_edge_kernel", "52_stencil_1d_jacobi_3-point_stencil", "53_stencil_2d_jacobi_5-point_stencil", "54_stencil_game_of_life", "55_transform_relu", "56_transform_negate_odds", "57_transform_inverse_offset", "58_transform_squaring", "59_transform_map_function"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3

From ffeba536659e37b963a00b6bc067a30e3367dd45 Mon Sep 17 00:00:00 2001
From: Daniel Nichols <dando18studios@gmail.com>
Date: Thu, 8 May 2025 18:28:22 -0400
Subject: [PATCH 6/6] change name to test-serial from test-cpu

---
 .github/workflows/serial-tests.yml       | 2 +-
 test/{test-cpu.bash => test-serial.bash} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename test/{test-cpu.bash => test-serial.bash} (100%)

diff --git a/.github/workflows/serial-tests.yml b/.github/workflows/serial-tests.yml
index 5505121..bdfa1f9 100644
--- a/.github/workflows/serial-tests.yml
+++ b/.github/workflows/serial-tests.yml
@@ -29,4 +29,4 @@ jobs:
           pip install tqdm
 
       - name: Run CPU test for ${{ matrix.problem }}
-        run: bash test/test-cpu.bash "${{ matrix.problem }}"
\ No newline at end of file
+        run: bash test/test-serial.bash "${{ matrix.problem }}"
\ No newline at end of file
diff --git a/test/test-cpu.bash b/test/test-serial.bash
similarity index 100%
rename from test/test-cpu.bash
rename to test/test-serial.bash