diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index 1a0337e..2813cbf 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -2,7 +2,7 @@
 
 ## Project Overview
 
-This is a Python implementation of LODA - an assembly language designed for integer sequences. The project enables reading, writing, evaluating, and generating LODA programs using machine learning techniques to discover new integer sequence programs.
+This is a Python implementation of LODA - an assembly language designed for integer sequences. The project enables reading, writing, and evaluating LODA programs, and searching for matches in the OEIS database.
 
 ## Core Concepts
 
@@ -15,10 +15,6 @@ This is a Python implementation of LODA - an assembly language designed for inte
 - **Operations**: `mov`, `add`, `sub`, `mul`, `div`, `dif`, `mod`, `pow`, `gcd`, `bin`, `cmp`, `min`, `max`, `lpb`, `lpe`
 - **Loops**: `lpb $n` starts loop, `lpe` ends loop (counter-based termination)
 
-### Token Encoding for ML
-Each operation becomes 3 tokens: `[operation_type, target_operand, source_operand]`
-Example: `mov $1,5` → `["mov", "$1", "5"]`
-
 ## Source Code Structure
 
 ### Core Language (`loda/lang/`)
@@ -36,9 +32,8 @@
 - **`program_cache.py`**: `ProgramCache` manages filesystem loading/caching
 - **`prefix_index.py`**: `PrefixIndex` enables sequence matching by prefix patterns
 
-### Machine Learning (`loda/ml/`)
+### Utilities (`loda/ml/`)
 - **`util.py`**: Token conversion utilities (program ↔ tokens, merging)
-- **`keras/program_generation_rnn.py`**: RNN model for program generation using TensorFlow
 
 ### Mining (`loda/mine/`)
 - **`miner.py`**: `Miner` searches for programs matching OEIS sequences
@@ -68,15 +63,6 @@ elif operand.type == OperandType.INDIRECT:
     value = memory[memory[operand.value]]
 ```
 
-### When working with ML tokens:
-```python
-# Convert programs to tokens for ML
-from loda.ml.util import program_to_tokens, tokens_to_program
-
-tokens = program_to_tokens(program)
-reconstructed = tokens_to_program(tokens)
-```
-
 ### When working with sequences:
 ```python
 # Always specify term count and handle evaluation errors
@@ -112,11 +98,11 @@ program = program_cache.get_program(sequence_id)
 
 ### Token Conversion Pattern:
 ```python
-# ML workflow
-tokens = program_to_tokens(program)
-# Process with ML model
-new_tokens = model.generate(tokens)
-new_program = tokens_to_program(new_tokens)
+# Token conversion utilities
+from loda.ml.util import program_to_tokens, tokens_to_program
+
+tokens, vocab = program_to_tokens(program)
+reconstructed = tokens_to_program(tokens)
 ```
 
 ## Testing Conventions
@@ -138,13 +124,11 @@ Always set appropriate limits:
 
 - Programs: `A######.asm` format (OEIS sequence numbers)
 - B-files: `b######.txt` format for sequence terms
-- Models: Use descriptive names with hyperparameters
 - Use relative paths from project root
 
 ## Integration Points
 
 - OEIS database integration via sequence IDs
-- TensorFlow/Keras for neural networks
 - File system caching for performance
 - CSV parsing for test data
 
diff --git a/README.md b/README.md
index d466d12..585d5b6 100644
--- a/README.md
+++ b/README.md
@@ -4,10 +4,8 @@ This Python package contains an implementation of the [LODA Language](https://lo
 an assembly language and computational model for finding integer sequence programs.
 
 This Python package allows you to read and write LODA programs, to evaluate
-them to integer sequences, to search for matches in the
-[OEIS](https://www.oeis.org/) database,
-and to use machine learning tools from [Tensorflow](https://www.tensorflow.org/)
-to find new integer sequence programs.
+them to integer sequences, and to search for matches in the
+[OEIS](https://www.oeis.org/) database.
 
 ## Getting Started
 
diff --git a/generate.py b/generate.py
deleted file mode 100644
index 73f2bd3..0000000
--- a/generate.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import argparse
-import datetime
-import os.path
-import sys
-
-from loda.lang import Program
-from loda.ml.keras.program_generation_rnn import load_model, Generator
-
-
-def eprint(*args, **kwargs):
-    print(*args, file=sys.stderr, **kwargs)
-
-
-def generate_programs(generator, num_programs: int, use_line_format: bool, write_fn, verbose=0):
-    for i in range(num_programs):
-        p = generator()
-        if use_line_format:
-            p = "; ".join([str(op) for op in p.operations])
-        write_fn("{}\n".format(p))
-        if verbose > 0 and i % 10 == 0:
-            ct = datetime.datetime.now()
-            eprint(ct, generator.get_stats_info_str())
-
-
-def generate(model_path: str, output_path=None, num_programs=100, format="asm", verbose=0):
-    model = load_model(model_path)
-    if verbose > 0:
-        model.summary(print_fn=eprint)
-    initial_program = Program()
-    # initial_program.operations.append(Operation("mov $1,1"))
-    num_lanes = 10
-    if num_programs >= 1000:
-        num_lanes = 100
-    elif num_programs >= 10000:
-        num_lanes = 1000
-    generator = Generator(
-        model, initial_program=initial_program, num_lanes=100)
-    use_line_format = (format == "line")
-    if output_path:
-        with open(output_path, "w") as file:
-            generate_programs(generator, num_programs,
-                              use_line_format, file.write, verbose)
-    else:
-        generate_programs(generator, num_programs,
-                          use_line_format, sys.stdout.write, verbose)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("model", type=str)
-    parser.add_argument(
-        "-f", "--format", type=str, choices=["asm", "line"], help="output format of the generated programs")
-    parser.add_argument(
-        "-o", "--output", type=str, help="output file for writing the programs to")
-    parser.add_argument(
-        "-n", type=int, help="number of programs to generate", default=100)
-    parser.add_argument('-v', '--verbose', action='count', default=0)
-    args = parser.parse_args()
-    generate(model_path=args.model, output_path=args.output,
-             num_programs=args.n, format=args.format, verbose=args.verbose)
diff --git a/loda/documentation.md b/loda/documentation.md
index 48b0ec3..eb22e4e 100644
--- a/loda/documentation.md
+++ b/loda/documentation.md
@@ -1,8 +1,6 @@
 This Python package allows you to read and write LODA programs, to evaluate
-them to integer sequences, to search for matches in the
-[OEIS](https://www.oeis.org/) database,
-and to use machine learning from [Tensorflow](https://www.tensorflow.org/)
-to generate new integer sequence programs.
+them to integer sequences, and to search for matches in the
+[OEIS](https://www.oeis.org/) database.
 
 ## Installation
 
diff --git a/loda/ml/keras/__init__.py b/loda/ml/keras/__init__.py
deleted file mode 100644
index 94c300c..0000000
--- a/loda/ml/keras/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Machine learning using Tensorflow/Keras."""
diff --git a/loda/ml/keras/program_generation_rnn.py b/loda/ml/keras/program_generation_rnn.py
deleted file mode 100644
index b61603d..0000000
--- a/loda/ml/keras/program_generation_rnn.py
+++ /dev/null
@@ -1,335 +0,0 @@
-"""
-Keras RNN models for LODA program generation.
-
-## Example
-
->>> # Train a model using existing programs:
->>> program_cache = ProgramCache("path/to/programs")
->>> model = train_model(program_cache)
->>>
->>> # Save a model to disk:
->>> model.save("sample_model")
->>>
->>> # Load a model from disk:
->>> model = load_model("sample_model")
->>>
->>> # Generated program using the model:
->>> generator = Generator(model)
->>> program = generator()
-"""
-
-import copy
-import time
-
-from loda.lang import Operation, Program
-from loda.oeis import ProgramCache
-from loda.ml import util
-
-import tensorflow as tf
-
-
-class Model(tf.keras.Model):
-    """Keras model for program generation using RNN."""
-
-    def __init__(self, vocabulary: list,
-                 embedding_dim: int, num_rnn_units: int,
-                 num_samples: int, sample_size: int,
-                 num_ops_per_sample: int, num_nops_separator: int,
-                 program_ids: list):
-
-        super().__init__(self)
-        self.vocabulary = vocabulary
-        self.embedding_dim = embedding_dim
-        self.num_rnn_units = num_rnn_units
-        self.num_samples = num_samples
-        self.sample_size = sample_size
-        self.num_ops_per_sample = num_ops_per_sample
-        self.num_nops_separator = num_nops_separator
-        self.program_ids = program_ids
-
-        # Initialize token <-> ID lookup layers.
-        self.tokens_to_ids = tf.keras.layers.StringLookup(
-            vocabulary=vocabulary, mask_token=None)
-        self.ids_to_tokens = tf.keras.layers.StringLookup(
-            vocabulary=self.tokens_to_ids.get_vocabulary(), invert=True, mask_token=None)
-        vocab_size = self.get_vocab_size()
-
-        # Create the processing layers.
-        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
-        self.gru = tf.keras.layers.GRU(num_rnn_units,
-                                       return_sequences=True,
-                                       return_state=True)
-        self.dense = tf.keras.layers.Dense(vocab_size)
-
-    def get_vocab_size(self):
-        return len(self.tokens_to_ids.get_vocabulary())
-
-    def call(self, inputs, states=None, return_state=False, training=False):
-        values = inputs
-        values = self.embedding(values, training=training)
-        if states is None:
-            states = self.gru.get_initial_state(values)
-        values, states = self.gru(
-            values, initial_state=states, training=training)
-        values = self.dense(values, training=training)
-        if return_state:
-            return values, states
-        else:
-            return values
-
-    def get_config(self):
-        return {"vocabulary": self.vocabulary,
-                "embedding_dim": self.embedding_dim,
-                "num_rnn_units": self.num_rnn_units,
-                "num_samples": self.num_samples,
-                "sample_size": self.sample_size,
-                "num_ops_per_sample": self.num_ops_per_sample,
-                "num_nops_separator": self.num_nops_separator,
-                "program_ids": self.program_ids}
-
-    def summary(self, line_length=None, positions=None, print_fn=None,
-                expand_nested=False, show_trainable=False, layer_range=None):
-        super().summary(line_length, positions, print_fn,
-                        expand_nested, show_trainable, layer_range)
-        print("Vocabulary size:", self.get_vocab_size())
-        print("Sample size:", self.sample_size)
-        print("Trained samples:", self.num_samples)
-        print("Trained programs:", len(self.program_ids))
-        print("Operation per sample:", self.num_ops_per_sample)
-        print("Nop separators:", self.num_nops_separator)
-
-    @classmethod
-    def from_config(cls, config):
-        return cls(**config)
-
-
-class Generator:
-
-    def __init__(self, model: Model, initial_program: Program = Program(), num_lanes: int = 1, temperature: float = 1.0):
-        """
-        Program generator based on a previously trained RNN model.
-
-        Args:
-            model: Previously trained or loaded `Model`.
-            initial_program: Program to initialize the generation. This can be empty.
-            num_lanes: Number of parallel lanes to use for program generation. Using more lanes
-                potentially increases the program generation performance, but also the memory usage.
-            temperature: Controls the randomness of the generated programs.
- """ - # Store members: - self.model = model - self.num_lanes = num_lanes - self.__temperature = temperature - # Prepare inputs and states: - initial_program = self.__prepare_initial_program(initial_program) - self.inputs = self.__program_to_input_ids(initial_program) - self.states = None - # Prepare lanes: - self.token_lanes = [] - self.program_lanes = [] - for _ in range(self.num_lanes): - self.token_lanes.append([]) - self.program_lanes.append(Program()) - # Prepare program queue: - self.program_queue = [] - # Statistics: - self.num_generated_programs = 0 - self.num_generated_tokens = 0 - self.num_generated_operations = 0 - self.num_generated_nops = 0 - self.num_token_errors = 0 - self.num_program_errors = 0 - self.start_time = time.time() - - def __call__(self) -> Program: - """Generate a program.""" - while len(self.program_queue) == 0: - self.__generate_programs() - return self.program_queue.pop() - - def __ids_to_tokens_str(self, ids) -> list: - return [t.numpy().decode("utf-8") for t in self.model.ids_to_tokens(ids)] - - def __prepare_initial_program(self, program: Program) -> Program: - initial = copy.deepcopy(program) - diff_sample_size = len(initial.operations) - \ - self.model.num_ops_per_sample - if diff_sample_size > 0: - initial.operations = initial.operations[diff_sample_size:] - elif diff_sample_size < 0: - tmp_program = Program() - util.append_nops(tmp_program, -diff_sample_size) - tmp_program.operations.extend(initial.operations) - initial = tmp_program - return initial - - def __program_to_input_ids(self, program: Program): - tokens, _ = util.program_to_tokens(program) - ids = self.model.tokens_to_ids(tokens).numpy() - return tf.constant([ids] * self.num_lanes) - - def __generate_ids(self): - - # Execute the model. - predicted_logits, states = self.model(inputs=self.inputs, - states=self.states, - return_state=True) - # Only use the last prediction. - predicted_logits = predicted_logits[:, -1, :] - predicted_logits = predicted_logits/self.__temperature - - # Sample the output logits to generate token IDs. 
-        self.inputs = tf.random.categorical(predicted_logits, num_samples=1)
-        self.states = states
-
-    def __generate_tokens(self):
-        self.__generate_ids()
-        next_tokens = self.__ids_to_tokens_str(
-            tf.squeeze(self.inputs, axis=-1))
-        # print("TOKENS: {}".format(next_tokens))
-        for i in range(self.num_lanes):
-            self.token_lanes[i].append(next_tokens[i])
-        self.num_generated_tokens += self.num_lanes
-
-    def __generate_operations(self):
-        # Generate three tokens for one operation:
-        self.__generate_tokens()
-        self.__generate_tokens()
-        self.__generate_tokens()
-        operations = []
-        for i in range(self.num_lanes):
-            op = util.tokens_to_operation(self.token_lanes[i], 0)
-            while op is None:
-                self.num_token_errors += 1
-                self.token_lanes[i].pop(0)
-                self.__generate_tokens()
-                op = util.tokens_to_operation(self.token_lanes[i], 0)
-            self.token_lanes[i].pop(0)
-            self.token_lanes[i].pop(0)
-            self.token_lanes[i].pop(0)
-            operations.append(op)
-        self.num_generated_operations += self.num_lanes
-        return operations
-
-    def __generate_programs(self):
-        operations = self.__generate_operations()
-        for i in range(self.num_lanes):
-            if operations[i].type == Operation.Type.NOP:
-                self.num_generated_nops += 1
-                if len(self.program_lanes[i].operations) > 0:
-                    try:
-                        self.program_lanes[i].validate()
-                        self.program_queue.append(self.program_lanes[i])
-                    except Exception as e:
-                        # print("PRORGRAM ERROR:", e)
-                        # print(program_lanes[i])
-                        self.num_program_errors += 1
-                    self.program_lanes[i] = Program()
-                    self.num_generated_programs += 1
-            else:
-                self.program_lanes[i].operations.append(operations[i])
-
-    def get_stats_info_str(self) -> str:
-        """
-        Returns an info string containing useful stats about this generator including
-        the number of generated programs, the generation speed, and error statistics.
-
-        Example output:
-        ```text
-        generated programs: 233, speed: 17.43 programs/s, token errors: 0.03%, program errors: 6.01%, separator overhead: -0.40%
-        ```
-        """
-        separator_overhead = (
-            self.num_generated_nops / (self.num_generated_programs * self.model.num_nops_separator)) - 1
-        return "generated programs: {}, speed: {:.2f} programs/s, token errors: {:.2f}%, program errors: {:.2f}%, separator overhead: {:.2f}%".format(
-            self.num_generated_programs,
-            self.num_generated_programs / (time.time() - self.start_time),
-            100 * self.num_token_errors / self.num_generated_tokens,
-            100 * self.num_program_errors / self.num_generated_programs,
-            100 * separator_overhead)
-
-
-def __create_dataset(ids: list, sample_size: int,
-                     batch_size: int = 128, buffer_size: int = 10000):
-
-    # Basic tensor dataset.
-    slice_dataset = tf.data.Dataset.from_tensor_slices(ids)
-
-    # We repeat the original dataset to make sure we sample at all
-    # possible start positions. We made sure already before that the
-    # the dataset size mod the sample size is +/-1. So this works!
-    # Note also that we don't need to enable drop_remainder here.
-    batch_dataset = slice_dataset.repeat(sample_size).batch(sample_size)
-
-    # Split the samples into (input,label) pairs.
-    split_dataset = batch_dataset.map(util.split_sample)
-
-    # Shuffle dataset.
-    prefetch_dataset = (split_dataset.shuffle(buffer_size).batch(
-        batch_size).prefetch(tf.data.experimental.AUTOTUNE))
-
-    return prefetch_dataset
-
-
-def load_model(model_path: str) -> Model:
-    """
-    Load a Keras RNN Model for program generation.
-
-    The model should have been generated using `train_model` and saved before.
-
-    Args:
-        model_path: File system path to the model to be loaded.
-
-    Return:
-        Returns the loaded `Model`.
-    """
-    return tf.keras.models.load_model(model_path, custom_objects={"Model": Model})
-
-
-def train_model(program_cache: ProgramCache, num_programs: int = -1,
-                num_ops_per_sample: int = 32, num_nops_separator: int = 24,
-                embedding_dim: int = 256, num_rnn_units: int = 1024,
-                epochs: int = 3):
-    """
-    Train a Keras RNN model for program generation.
-
-    Args:
-        program_cache: Program cache that contains the programs used for training the model.
-        num_programs: Number of programs used for training (-1 for all available programs).
-        num_ops_per_sample: Number of operations per sample. We recommend to set this approximately
-            to the length of the longest loops in the training programs. This enables the model
-            to learn the structure of closed program loops and avoid generation of broken loops.
-        num_nops_separator: Number of `nop` operations used as separator between trained programs.
-            We recommend to set this to 75% of `num_ops_per_sample`, but at least 1.
-        embedding_dim: Embedding dimensions.
-        num_rnn_units: Number of RNN units.
-        epochs: Number of epochs for training.
-
-    Return:
-        This function returns the trained Keras model.
-    """
-    # Get random program IDs.
-    program_ids = util.get_random_program_ids(program_cache, num_programs)
-
-    # Load programs and convert to tokens and vocabulary.
-    merged_programs, num_samples, sample_size = util.merge_programs(
-        program_cache, program_ids,
-        num_ops_per_sample=num_ops_per_sample,
-        num_nops_separator=num_nops_separator)
-    tokens, vocabulary = util.program_to_tokens(merged_programs)
-
-    # Create Keras model and dataset, run the training, and save the model.
-    program_ids = sorted(program_ids)
-    model = Model(vocabulary,
-                  embedding_dim=embedding_dim,
-                  num_rnn_units=num_rnn_units,
-                  num_samples=num_samples,
-                  sample_size=sample_size,
-                  num_ops_per_sample=num_ops_per_sample,
-                  num_nops_separator=num_nops_separator,
-                  program_ids=program_ids)
-    ids = model.tokens_to_ids(tokens)
-    dataset = __create_dataset(ids, sample_size=sample_size)
-    loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
-    model.compile(optimizer="adam", loss=loss)
-    model.fit(dataset, epochs=epochs)
-    return model
diff --git a/requirements.txt b/requirements.txt
index eb7c945..bbeb417 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,5 +2,3 @@ argparse
 nose2
 parameterized
 requests
-tensorflow; sys_platform != 'darwin'
-tensorflow-macos; sys_platform == 'darwin'
diff --git a/sample.py b/sample.py
index 219edf1..6fe68ae 100644
--- a/sample.py
+++ b/sample.py
@@ -4,7 +4,6 @@
 from loda.oeis import ProgramCache, Sequence
 from loda.runtime import Evaluator, Interpreter
 from loda.mine import Miner
-from loda.ml.keras.program_generation_rnn import load_model, train_model, Generator
 
 
 class SampleLODA:
diff --git a/tests/test_ml.py b/tests/test_ml.py
index c527db4..a801c8b 100644
--- a/tests/test_ml.py
+++ b/tests/test_ml.py
@@ -1,28 +1,12 @@
 # -*- coding: utf-8 -*-
 
 from unittest import TestCase
 
-from loda.ml.keras.program_generation_rnn import *
 from loda.oeis import ProgramCache
 from loda.ml import util
 from tests.helpers import PROGRAMS_TEST_DIR
 
 
-#class ProgramGenerationRNNTests(TestCase):
-#
-#    def setUp(self):
-#        self.program_cache = ProgramCache(PROGRAMS_TEST_DIR)
-#
-#    def test_model(self):
-#        model = train_model(self.program_cache)
-#        model.save("test_model")
-#        loaded = load_model("test_model")
-#        loaded.summary()
-#        generator = Generator(loaded, num_lanes=10)
-#        for _ in range(10):
-#            generator()
-
-
 class UtilTests(TestCase):
 
     def setUp(self):
diff --git a/train.py b/train.py
deleted file mode 100644
index bac61dc..0000000
--- a/train.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import os.path
-
-from loda.oeis import ProgramCache
-from loda.ml.keras.program_generation_rnn import train_model
-
-
-def train(programs_percentage: int):
-    programs_dir = os.path.expanduser("~/loda/programs/oeis")
-    program_cache = ProgramCache(programs_dir)
-    num_train_programs = -1
-    if programs_percentage < 100:
-        num_total_programs = len(program_cache.all_ids())
-        num_train_programs = (programs_percentage * num_total_programs) // 100
-    print("Training using {} programs".format(num_train_programs))
-    model = train_model(program_cache, num_programs=num_train_programs)
-    model.save("model-{:03}".format(programs_percentage))
-
-
-if __name__ == "__main__":
-    train(1)
-    train(25)
-    train(50)
-    train(100)
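
For reference, the token round-trip that survives this removal can be exercised without TensorFlow. The following is a minimal sketch, not part of the patch: it assumes the `loda.ml.util` API shown in the updated copilot-instructions above (`program_to_tokens` returns a `(tokens, vocabulary)` pair), and the cache path and sequence id are placeholders.

```python
# Minimal sketch (not part of this patch): exercise the retained
# loda.ml.util token round-trip after the Keras code is removed.
# Assumes program_to_tokens returns (tokens, vocabulary), as shown in
# the updated copilot-instructions; path and id below are placeholders.
from loda.ml.util import program_to_tokens, tokens_to_program
from loda.oeis import ProgramCache

program_cache = ProgramCache("path/to/programs")  # placeholder path
program = program_cache.get_program(45)           # placeholder id, e.g. A000045

tokens, vocab = program_to_tokens(program)        # e.g. ["mov", "$1", "5", ...]
reconstructed = tokens_to_program(tokens)         # round-trip back to a Program
print(reconstructed)
```

Since `tests/test_ml.py` keeps `UtilTests`, this round-trip remains covered by the test suite even though the RNN generator and its tests are gone.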