NOTE(review): this patch was re-flowed from a whitespace-collapsed copy.
The indentation of the setup.py context lines and the blank context line in
the .gitignore hunk are reconstructed -- verify them against the target tree.
The index hash of upload_hg/generate_json_files.py is carried over from the
original patch and does not describe the revised file content.

diff --git a/.gitignore b/.gitignore
index 9531f03..ac51a34 100644
--- a/.gitignore
+++ b/.gitignore
@@ -128,3 +128,5 @@ dmypy.json
 
 # modelset test
 modelset
+/ecore.jsonl
+/uml.jsonl
diff --git a/requirements.txt b/requirements.txt
index 3bd6124..cf06bc0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ scikit-learn
 numpy
 tqdm
 gensim==4.2.0
+datasets
diff --git a/setup.py b/setup.py
index fd9c07c..c07e3a5 100644
--- a/setup.py
+++ b/setup.py
@@ -30,6 +30,7 @@
         "scikit-learn",
         "numpy",
         "tqdm",
-        "gensim==4.2.0"
+        "gensim==4.2.0",
+        "datasets"
     ]
 )
diff --git a/upload_hg/generate_json_files.py b/upload_hg/generate_json_files.py
new file mode 100644
index 0000000..e1fb9b6
--- /dev/null
+++ b/upload_hg/generate_json_files.py
@@ -0,0 +1,77 @@
+"""Export the ModelSet 'ecore' and 'uml' datasets as JSON Lines files."""
+import sys
+
+import pandas as pd
+
+sys.path.append("./src")
+
+# Threshold forwarded to to_normalized_df() for both dataset variants.
+MIN_OCCURRENCES_PER_CATEGORY = 10
+
+
+def _read_files(paths):
+    """Return the text content of each file in *paths*, in order."""
+    contents = []
+    for path in paths:
+        with open(path, 'r') as file:
+            contents.append(file.read())
+    return contents
+
+
+def main():
+    """Write one ``<model_type>.jsonl`` file per model type.
+
+    Each record holds the model id, its category label, the contents of
+    its txt, graph and XMI files, the model type, and a flag telling
+    whether the model is missing from the deduplicated dataset.
+    """
+    # Imported here, after "./src" has been appended to sys.path above.
+    from modelset import load
+
+    for model_type in ['ecore', 'uml']:
+        dataset = load(modeltype=model_type, selected_analysis=['stats'])
+        dataset_df = dataset.to_normalized_df(
+            min_occurrences_per_category=MIN_OCCURRENCES_PER_CATEGORY)
+        dataset_df_no_dups = dataset.to_normalized_df(
+            remove_duplicates=True,
+            min_occurrences_per_category=MIN_OCCURRENCES_PER_CATEGORY)
+
+        ids = list(dataset_df['id'])
+        ids_no_dups = list(dataset_df_no_dups['id'])
+        print(f'Full dataset {len(ids)}')
+        print(f'No dup dataset {len(ids_no_dups)}')
+        labels = list(dataset_df['category'])
+
+        txt_contents = _read_files(dataset.txt_file(i) for i in ids)
+        graph_contents = _read_files(dataset.graph_file(i) for i in ids)
+
+        # Important: if True, the model does not belong in the
+        # deduplicated version. A set keeps each lookup O(1).
+        kept_ids = set(ids_no_dups)
+        is_dup = [i not in kept_ids for i in ids]
+        n_dups = sum(is_dup)
+        print(f'Duplicate: {n_dups}')
+        print(f'No Duplicate: {len(is_dup) - n_dups}')
+
+        # XMI
+        xmi_contents = _read_files(
+            dataset.model_file(dataset.get_model_by_id(i)) for i in ids)
+
+        final_pd = pd.DataFrame.from_dict({
+            "ids": ids,
+            "labels": labels,
+            "txt": txt_contents,
+            "graph": graph_contents,
+            "xmi": xmi_contents,
+            "model_type": [model_type] * len(ids),
+            "is_duplicated": is_dup
+        })
+
+        # lines=True emits one JSON object per line, which is what the
+        # .jsonl extension promises; without it pandas writes one array.
+        final_pd.to_json(f'{model_type}.jsonl', orient='records',
+                         lines=True)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/upload_hg/test_hg.py b/upload_hg/test_hg.py
new file mode 100644
index 0000000..816a067
--- /dev/null
+++ b/upload_hg/test_hg.py
@@ -0,0 +1,7 @@
+from datasets import load_dataset
+import json
+
+dataset_hg = load_dataset('antolin/modelset', split="train")
+
+print(dataset_hg)
+print(dataset_hg["xmi"][0])