From 9c23df9cd819cb881c6757fe5241735a8b17750f Mon Sep 17 00:00:00 2001 From: Theo Date: Fri, 23 May 2025 19:35:51 +0200 Subject: [PATCH 01/10] Adding a glob option to load_from_config --- superstyl/load_from_config.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/superstyl/load_from_config.py b/superstyl/load_from_config.py index 4b66c445..103dd853 100644 --- a/superstyl/load_from_config.py +++ b/superstyl/load_from_config.py @@ -2,6 +2,7 @@ import superstyl import pandas as pd import os +import glob from superstyl.load import load_corpus @@ -28,11 +29,25 @@ def load_corpus_from_config(config_path): config = json.load(f) # Get corpus paths + if 'paths' in config: if isinstance(config['paths'], list): - paths = config['paths'] + paths = [] + for path in config['paths']: + if '*' in path or '?' in path or '[' in path: + expanded_paths = glob.glob(path) + if not expanded_paths: + print(f"Warning: No files found for pattern '{path}'") + paths.extend(expanded_paths) + else: + paths.append(path) elif isinstance(config['paths'], str): - paths = [config['paths']] + if '*' in config['paths'] or '?' 
in config['paths'] or '[' in config['paths']: + paths = glob.glob(config['paths']) + if not paths: + raise ValueError(f"No files found for glob pattern '{config['paths']}'") + else: + paths = [config['paths']] else: raise ValueError("Paths in config must be either a list or a glob pattern string") else: @@ -83,7 +98,7 @@ def load_corpus_from_config(config_path): 'neighbouring_size': feature_config.get('neighbouring_size', 10), 'culling': feature_config.get('culling', 0) } - + print(f"Loading corpus with {feature_name}...") corpus, features = load_corpus(paths, feat_list=feat_list, **params) From 57caa7236958a82aad4ac2665f5ba746296d4534 Mon Sep 17 00:00:00 2001 From: Theo Date: Wed, 4 Jun 2025 19:25:27 +0200 Subject: [PATCH 02/10] Add file to debug --- superstyl/load.py | 1 - superstyl/load_from_config.py | 26 ++++++++++++++++++-------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/superstyl/load.py b/superstyl/load.py index 4528c6f6..edc8232e 100644 --- a/superstyl/load.py +++ b/superstyl/load.py @@ -81,7 +81,6 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp my_feats = [m[0] for m in feat_list] # keeping only the features without the frequencies myTexts = fex.get_counts(myTexts, feat_list=my_feats, feats=feats, n=n, freqsType=freqsType) - if embedding: print(".......embedding counts.......") myTexts, my_feats = embed.get_embedded_counts(myTexts, my_feats, model, topn=neighbouring_size) diff --git a/superstyl/load_from_config.py b/superstyl/load_from_config.py index 103dd853..9bf433fb 100644 --- a/superstyl/load_from_config.py +++ b/superstyl/load_from_config.py @@ -55,7 +55,7 @@ def load_corpus_from_config(config_path): # Get sampling parameters sampling_params = config.get('sampling', {}) - + # Use the first feature to create the base corpus with sampling feature_configs = config.get('features', []) if not feature_configs: @@ -89,7 +89,7 @@ def load_corpus_from_config(config_path): 'size': 
sampling_params.get('sample_size', 3000), 'step': sampling_params.get('sample_step', None), 'max_samples': sampling_params.get('max_samples', None), - 'samples_random': sampling_params.get('sample_random', False), + 'samples_random': sampling_params.get('samples_random', False), 'keep_punct': feature_config.get('keep_punct', False), 'keep_sym': feature_config.get('keep_sym', False), 'no_ascii': feature_config.get('no_ascii', False), @@ -115,6 +115,7 @@ def load_corpus_from_config(config_path): # Check for feature list file feat_list = None feat_list_path = feature_config.get('feat_list') + print(feat_list_path) if feat_list_path: if feat_list_path.endswith('.json'): with open(feat_list_path, 'r') as f: @@ -135,7 +136,7 @@ def load_corpus_from_config(config_path): 'size': sampling_params.get('sample_size', 3000), 'step': sampling_params.get('sample_step', None), 'max_samples': sampling_params.get('max_samples', None), - 'samples_random': sampling_params.get('sample_random', False), + 'samples_random': sampling_params.get('samples_random', False), 'keep_punct': config.get('keep_punct', False), 'keep_sym': config.get('keep_sym', False), 'no_ascii': config.get('no_ascii', False), @@ -146,11 +147,17 @@ def load_corpus_from_config(config_path): } print(f"Loading {feature_name}...") + corpus, features = load_corpus(paths, feat_list=feat_list, **params) # Store corpus and features corpora[feature_name] = corpus - feature_lists[feature_name] = features + + if feat_list is not None: + feature_lists[feature_name] = feat_list + else: + feature_lists[feature_name] = features + print(len(feature_lists[feature_name])) # Create a merged dataset print("Creating merged dataset...") @@ -170,19 +177,22 @@ def load_corpus_from_config(config_path): # Add features from each corpus for name, corpus in corpora.items(): + single_feature = [] + feature_cols = [col for col in corpus.columns if col not in ['author', 'lang']] # Rename columns to avoid duplicates - renamed_cols = {col: 
f"{name}_{col}" for col in feature_cols} - feature_df = corpus[feature_cols].rename(columns=renamed_cols) + #renamed_cols = {col: col for col in feature_cols} + feature_df = corpus[feature_cols]#.rename(columns=renamed_cols) # Merge with the main DataFrame merged = pd.concat([merged, feature_df], axis=1) # Add features to the combined list with prefixes - for feature in feature_lists[name]: - all_features.append((f"{name}_{feature[0]}", feature[1])) + for feature in corpus.columns:#feature_lists[name]: + single_feature.append((feature, 0))#[0], feature[1])) + all_features.append(single_feature) # Return the merged corpus and combined feature list return merged, all_features From 984770952b5795bd944cb07bfc11285b73952d37 Mon Sep 17 00:00:00 2001 From: Theo Date: Thu, 20 Nov 2025 14:01:36 +0100 Subject: [PATCH 03/10] Change load from config --- superstyl/load_from_config.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/superstyl/load_from_config.py b/superstyl/load_from_config.py index 9bf433fb..5dc7d3a6 100644 --- a/superstyl/load_from_config.py +++ b/superstyl/load_from_config.py @@ -6,7 +6,7 @@ from superstyl.load import load_corpus -def load_corpus_from_config(config_path): +def load_corpus_from_config(config_path, is_test=False): """ Load a corpus based on a JSON configuration file. 
@@ -87,7 +87,7 @@ def load_corpus_from_config(config_path): 'sampling': sampling_params.get('enabled', False), 'units': sampling_params.get('units', 'words'), 'size': sampling_params.get('sample_size', 3000), - 'step': sampling_params.get('sample_step', None), + 'step': sampling_params.get('step', None), 'max_samples': sampling_params.get('max_samples', None), 'samples_random': sampling_params.get('samples_random', False), 'keep_punct': feature_config.get('keep_punct', False), @@ -134,7 +134,7 @@ def load_corpus_from_config(config_path): 'sampling': sampling_params.get('enabled', False), 'units': sampling_params.get('units', 'words'), 'size': sampling_params.get('sample_size', 3000), - 'step': sampling_params.get('sample_step', None), + 'step': sampling_params.get('step', None), 'max_samples': sampling_params.get('max_samples', None), 'samples_random': sampling_params.get('samples_random', False), 'keep_punct': config.get('keep_punct', False), @@ -153,11 +153,11 @@ def load_corpus_from_config(config_path): # Store corpus and features corpora[feature_name] = corpus - if feat_list is not None: + if feat_list is not None and is_test: feature_lists[feature_name] = feat_list else: feature_lists[feature_name] = features - print(len(feature_lists[feature_name])) + # Create a merged dataset print("Creating merged dataset...") @@ -182,15 +182,15 @@ def load_corpus_from_config(config_path): feature_cols = [col for col in corpus.columns if col not in ['author', 'lang']] # Rename columns to avoid duplicates - #renamed_cols = {col: col for col in feature_cols} - feature_df = corpus[feature_cols]#.rename(columns=renamed_cols) + renamed_cols = {col: col for col in feature_cols} + feature_df = corpus[feature_cols].rename(columns=renamed_cols) # Merge with the main DataFrame merged = pd.concat([merged, feature_df], axis=1) # Add features to the combined list with prefixes - for feature in corpus.columns:#feature_lists[name]: - single_feature.append((feature, 0))#[0], feature[1])) + 
for feature in feature_lists[name]: + single_feature.append((feature[0], feature[1])) all_features.append(single_feature) # Return the merged corpus and combined feature list From 6fc2f21fbaf8b88d8bf1911cd1aad8103620fe48 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Thu, 20 Nov 2025 14:04:16 +0100 Subject: [PATCH 04/10] correct Florians bug --- superstyl/svm.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/superstyl/svm.py b/superstyl/svm.py index 204987a5..abceb823 100755 --- a/superstyl/svm.py +++ b/superstyl/svm.py @@ -288,13 +288,13 @@ def plot_coefficients(coefs, feature_names, current_class, top_features=10): -def plot_rolling(final_predictions, smoothing=3): +def plot_rolling(final_predictions, smoothing=3, xlab = "Index (segment center)"): """ Plots the rolling stylometry results as lines of decision function values over the text. Parameters: - final_predictions_path : str - Path to the CSV file containing final predictions generated by the SVM pipeline. + final_predictions : Pandas dataframe containing the final predictions out of train_svm + . smoothing : int or None The window size for smoothing the curves. 
@@ -304,24 +304,25 @@ def plot_rolling(final_predictions, smoothing=3): """ # Extract the segment center from the filename + my_final_predictions = final_predictions.copy() # to avoid modifying in place segment_centers = [] - for fname in final_predictions['filename']: + for fname in my_final_predictions['filename']: parts = fname.split('_')[-1].split('-') start = int(parts[0]) end = int(parts[1]) center = (start + end) / 2.0 segment_centers.append(center) - final_predictions['segment_center'] = segment_centers + my_final_predictions['segment_center'] = segment_centers - final_predictions['filename'] = [fname.split('_')[1] for fname in final_predictions['filename']] + my_final_predictions['filename'] = [fname.split('_')[1] for fname in my_final_predictions['filename']] # Identify candidate columns known_cols = {'filename', 'author', 'segment_center'} - candidate_cols = [c for c in final_predictions.columns if c not in known_cols] + candidate_cols = [c for c in my_final_predictions.columns if c not in known_cols] - for work in final_predictions['filename'].unique(): - fpreds_work = final_predictions[final_predictions['filename'] == work] + for work in my_final_predictions['filename'].unique(): + fpreds_work = my_final_predictions[my_final_predictions['filename'] == work] # Sort by segment center to ensure chronological order fpreds_work = fpreds_work.sort_values('segment_center') @@ -336,7 +337,7 @@ def plot_rolling(final_predictions, smoothing=3): plt.plot(fpreds_work['segment_center'], fpreds_work[col], label=col, linewidth=2) plt.title('Rolling Stylometry Decision Functions Over ' + work) - plt.xlabel('Word index (segment center)') + plt.xlabel(xlab) plt.ylabel('Decision Function Value') plt.ylim(min(-2, min(fpreds_work[candidate_cols].min()) - 0.2), max(1, max(fpreds_work[candidate_cols].max())) + 0.2) From 71461061b8cec5ed379a4b46e3b34d8288631540 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Thu, 20 Nov 2025 14:46:44 +0100 Subject: [PATCH 05/10] 
fixing this lemma issue (bye bye pos for english) --- superstyl/load.py | 15 +- superstyl/preproc/features_extract.py | 28 +-- superstyl/preproc/pipe.py | 251 ++++++++++++++------------ 3 files changed, 157 insertions(+), 137 deletions(-) diff --git a/superstyl/load.py b/superstyl/load.py index edc8232e..7c50a790 100644 --- a/superstyl/load.py +++ b/superstyl/load.py @@ -14,10 +14,11 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp Main function to load a corpus from a collection of file, and an optional list of features to extract. :param data_paths: paths to the source files :param feat_list: an optional list of features (as created by load_corpus), default None - :param feats: the type of features, one of 'words', 'chars', 'affixes, and 'POS'. Affixes are inspired by - Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and, if keep_pos, punctuation n-grams. - POS are currently only implemented for Modern English - TODO: add met_line, met_syll + :param feats: the type of features, one of 'words', 'chars', 'affixes, 'lemma', 'pos', 'met_line' and 'met_syll'. + Affixes are inspired by Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and, + if keep_punct, punctuation n-grams. From TEI, pos, lemma, met_line or met_syll can + be extracted; met_line is the prosodic (stress) annotation of a full verse; met_syll is a char n-gram of prosodic + annotation :param n: n grams lengths (default 1) :param k: How many most frequent? The function takes the rank of k (if k is smaller than the total number of features), gets its frequencies, and only include features of superior or equal total frequencies. 
@@ -45,6 +46,12 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp :return a pandas dataFrame of text metadata and feature frequencies; a global list of features with their frequencies """ + if feats in ('lemma', 'pos', 'met_line', 'met_syll') and format is not 'tei': + raise ValueError("lemma, pos, met_line or met_syll are only possible with adequate tei format (@lemma, @pos, @met)") + + if feats in ('met_line', 'met_syll') and units is not 'lines': + raise ValueError("met_line or met_syll are only possible with tei format that includes lines and @met") + embeddedFreqs = False if embedding: print(".......loading embedding.......") diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py index fe6ba5c0..3ef055d3 100755 --- a/superstyl/preproc/features_extract.py +++ b/superstyl/preproc/features_extract.py @@ -20,9 +20,9 @@ def count_features(text, feats ="words", n = 1): raise ValueError("Text cannot be empty.") if n < 1 or not isinstance(n, int): raise ValueError("n must be a positive integer.") - if feats not in ["words", "chars", "affixes", "pos", "met_line", "met_syll"]: - raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', or 'pos'.") - if feats == "words": + if feats not in ["words", "chars", "affixes", "lemmas", "pos", "met_line", "met_syll"]: + raise ValueError("Unsupported feature type. 
Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', 'lemmas' or 'pos'.") + if feats in ("words", "lemmas", "pos"): tokens = nltk.tokenize.wordpunct_tokenize(text) if n > 1: tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))] @@ -46,20 +46,6 @@ def count_features(text, feats ="words", n = 1): ] tokens = affs + space_affs_and_punct - #POS in english with NLTK - need to propose spacy later on - elif feats == "pos": - try: - nltk.data.find('taggers/averaged_perceptron_tagger_eng') - except: - nltk.download('averaged_perceptron_tagger_eng') - words = nltk.tokenize.wordpunct_tokenize(text) - pos_tags = [pos for word, pos in nltk.pos_tag(words)] - if n > 1: - tokens = ["_".join(t) for t in list(nltk.ngrams(pos_tags, n))] - else: - tokens = pos_tags - total = len(tokens) - elif feats == "met_line": tokens = text.split() if n > 1: @@ -73,7 +59,7 @@ def count_features(text, feats ="words", n = 1): #Adding an error message in case some distracted guy like me would enter something wrong: else: - raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll' or 'pos'.") + raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', 'lemmas' or 'pos'.") counts = Counter() counts.update(tokens) @@ -108,7 +94,7 @@ def get_feature_list(myTexts, feats="words", n=1, freqsType="relative"): """ :param myTexts: a 'myTexts' object, containing documents to be processed :param feat_list: a list of features to be selected - :param feats: type of feats (words, chars, affixes, POS, met_line, or met_syll) + :param feats: the type of features, one of 'words', 'chars', 'affixes, 'lemma', 'pos', 'met_line' and 'met_syll'. 
:param freqsType: "relative", "absolute" or "binary" frequencies :param n: n-grams length :return: list of features, with total frequency @@ -142,14 +128,12 @@ def get_doc_frequency(myTexts): return feats_doc_freq - - def get_counts(myTexts, feat_list=None, feats = "words", n = 1, freqsType = "relative"): """ Get counts for a collection of texts :param myTexts: the document collection :param feat_list: a list of features to be selected (None for all) - :param feats: the type of feats (words, chars, affixes, POS) + :param feats: the type of features, one of 'words', 'chars', 'affixes, 'lemma', 'pos', 'met_line' and 'met_syll'. :param n: the length of n-grams :param freqsType: relative, absolute or binarised freqs :return: the collection with, for each text, a 'wordCounts' dictionary diff --git a/superstyl/preproc/pipe.py b/superstyl/preproc/pipe.py index c86949da..81819a00 100755 --- a/superstyl/preproc/pipe.py +++ b/superstyl/preproc/pipe.py @@ -52,6 +52,133 @@ def XML_to_text(path): return aut, re.sub(r"\s+", " ", str(myxsl(my_doc))) +def txm_to_units(path, units="lines"): + #TODO: it would be fairly easy to implement lemma and pos feats, like for tei. 
If it is ever useful + myxsl = etree.XML(''' + + + + + + + + + + + + + + + + + + + + + + + + + +''') + myxsl = etree.XSLT(myxsl) + + with open(path, 'r') as f: + my_doc = etree.parse(f) + + #units_tokens = str(myxsl(my_doc, units=etree.XSLT.strparam(units), feats=etree.XSLT.strparam(feats))).splitlines() + units_tokens = str(myxsl(my_doc, units=etree.XSLT.strparam(units))).splitlines() + return units_tokens + +def tei_to_units(path, feats="words", units="lines"): + + if feats in ["met_syll", "met_line"]: + feats = "met" + myxsl = etree.XML(''' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ''') + myxsl = etree.XSLT(myxsl) + + with open(path, 'r') as f: + my_doc = etree.parse(f) + + units_tokens = str(myxsl(my_doc, units=etree.XSLT.strparam(units), feats=etree.XSLT.strparam(feats))).splitlines() + return units_tokens + +def specialXML_to_text(path, format="tei", feats="words"): + aut = path.split('/')[-1].split("_")[0] + if format=="tei": + units_tokens = tei_to_units(path, feats=feats, units="words") + + if format=="txm": + units_tokens = txm_to_units(path, feats=feats, units="words") + + return aut, re.sub(r"\s+", " ", str(' '.join(units_tokens))) + def TXT_to_text(path): """ Get main text from xml file @@ -147,7 +274,7 @@ def load_texts(paths, identify_lang=False, feats="words", format="txt", keep_pun Loads a collection of documents into a 'myTexts' object for further processing. TODO: a proper class :param paths: path to docs - TODO: add feats! + :param feats: the type of features, one of 'words', 'chars', 'affixes, 'lemma', 'pos', 'met_line' and 'met_syll'. :param identify_lang: whether or not try to identify lang (default: False) :param format: format of the source files (implemented values: txt [default], xml) :param keep_punct: whether or not to keep punctuation and caps. 
@@ -165,6 +292,9 @@ def load_texts(paths, identify_lang=False, feats="words", format="txt", keep_pun if format=='xml': aut, text = XML_to_text(path) + if format in ('tei', 'txm'): + aut, text = specialXML_to_text(path, format=format, feats=feats) + else: aut, text = TXT_to_text(path) @@ -198,7 +328,7 @@ def get_samples(path, size, step=None, samples_random=False, max_samples=10, :param max_samples: maximum number of samples per author/clas :param units: the units to use, one of "words" or "verses" :param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED) - :param feats: the type of features, TODO: document + :param feats: the type of features, one of 'words', 'chars', 'affixes, 'lemma', 'pos', 'met_line' and 'met_syll'. """ if samples_random and step is not None: @@ -213,127 +343,26 @@ def get_samples(path, size, step=None, samples_random=False, max_samples=10, if units == "words" and format == "txt": my_doc = TXT_to_text(path) text = normalise(my_doc[1], keep_punct=keep_punct, keep_sym=keep_sym, no_ascii=no_ascii) - units = nltk.tokenize.wordpunct_tokenize(text) + units_tokens = nltk.tokenize.wordpunct_tokenize(text) - #TODO: DOCUMENT this format as TXM, and keep it only for retrocompatibility + #Kept only for retrocompatibility with Psysché if units == "verses" and format == "txm": - myxsl = etree.XML(''' - - - - - - - - - - - - - - - - - - ''') - myxsl = etree.XSLT(myxsl) - - with open(path, 'r') as f: - my_doc = etree.parse(f) - - units = str(myxsl(my_doc)).splitlines() - - # and now generating output - samples = [] + units_tokens = txm_to_units(path, units=units) if format == "tei": - if feats in ["met_syll", "met_line"]: - feats = "met" - myxsl = etree.XML(''' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -''') - myxsl = etree.XSLT(myxsl) - - with open(path, 'r') as f: - my_doc = etree.parse(f) - - units = str(myxsl(my_doc, 
units=etree.XSLT.strparam(units), feats=etree.XSLT.strparam(feats))).splitlines() + units_tokens = tei_to_units(path, units=units, feats=feats) # and now generating output samples = [] if samples_random: for k in range(max_samples): - samples.append({"start": str(k)+'s', "end": str(k)+'e', "text": list(random.choices(units, k=size))}) + samples.append({"start": str(k)+'s', "end": str(k)+'e', "text": list(random.choices(units_tokens, k=size))}) else: current = 0 - while current + size <= len(units): - samples.append({"start": current, "end": current + size, "text": list(units[current:(current + size)])}) + while current + size <= len(units_tokens): + samples.append({"start": current, "end": current + size, "text": list(units_tokens[current:(current + size)])}) current = current + step return samples @@ -353,7 +382,7 @@ def docs_to_samples(paths, size, step=None, units="words", samples_random=False, :param keep_punct: whether to keep punctuation and caps. :param max_samples: maximum number of samples per author/class. :param identify_lang: whether to try to identify lang (default: False) - :param feats: TODO + :param feats: the type of features, one of 'words', 'chars', 'affixes, 'lemma', 'pos', 'met_line' and 'met_syll'. 
:return: a myTexts object """ myTexts = [] From 290286ca607314af56c8bb0fe4a344bfa9609ac8 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Thu, 20 Nov 2025 15:12:08 +0100 Subject: [PATCH 06/10] fixed typos --- superstyl/load.py | 4 ++-- superstyl/preproc/features_extract.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/superstyl/load.py b/superstyl/load.py index 7c50a790..fe52683e 100644 --- a/superstyl/load.py +++ b/superstyl/load.py @@ -46,10 +46,10 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp :return a pandas dataFrame of text metadata and feature frequencies; a global list of features with their frequencies """ - if feats in ('lemma', 'pos', 'met_line', 'met_syll') and format is not 'tei': + if feats in ('lemma', 'pos', 'met_line', 'met_syll') and format != 'tei': raise ValueError("lemma, pos, met_line or met_syll are only possible with adequate tei format (@lemma, @pos, @met)") - if feats in ('met_line', 'met_syll') and units is not 'lines': + if feats in ('met_line', 'met_syll') and units != 'lines': raise ValueError("met_line or met_syll are only possible with tei format that includes lines and @met") embeddedFreqs = False diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py index 3ef055d3..d04d8c12 100755 --- a/superstyl/preproc/features_extract.py +++ b/superstyl/preproc/features_extract.py @@ -10,7 +10,7 @@ def count_features(text, feats ="words", n = 1): Get feature counts from a text (words, chars or POS n-grams, or affixes(+punct if keep_punct), following Sapkota et al., NAACL 2015 :param text: the source text - :param feats: the type of feats: words, chars, POS (supported only for English), or affixes + :param feats: the type of features, one of 'words', 'chars', 'affixes, 'lemma', 'pos', 'met_line' and 'met_syll'. 
:param n: the length of n-grams :return: features absolute frequencies in text as a counter, and the total of frequencies """ @@ -20,9 +20,9 @@ def count_features(text, feats ="words", n = 1): raise ValueError("Text cannot be empty.") if n < 1 or not isinstance(n, int): raise ValueError("n must be a positive integer.") - if feats not in ["words", "chars", "affixes", "lemmas", "pos", "met_line", "met_syll"]: - raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', 'lemmas' or 'pos'.") - if feats in ("words", "lemmas", "pos"): + if feats not in ["words", "chars", "affixes", "lemma", "pos", "met_line", "met_syll"]: + raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', 'lemma' or 'pos'.") + if feats in ("words", "lemma", "pos"): tokens = nltk.tokenize.wordpunct_tokenize(text) if n > 1: tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))] From 8008860198bdce542d05cd6920fd6030c77473ee Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Thu, 20 Nov 2025 15:20:24 +0100 Subject: [PATCH 07/10] suppressed test for pos tagging --- load_corpus.py | 11 ++++++----- tests/test_load_corpus.py | 17 +---------------- 2 files changed, 7 insertions(+), 21 deletions(-) diff --git a/load_corpus.py b/load_corpus.py index f325f30d..668e7524 100755 --- a/load_corpus.py +++ b/load_corpus.py @@ -18,18 +18,19 @@ parser.add_argument('-f', action="store", help="optional list of features, either in json (generated by" " Superstyl) or simple txt (one word per line)", default=False) parser.add_argument('-t', action='store', help="types of features (words, chars, affixes - " - "as per Sapkota et al. 2015 - or pos). pos are currently" - "only implemented for Modern English", type=str, - default="words", choices=["words", "chars", "affixes", "pos"]) + "as per Sapkota et al. 
2015 -, as well as lemma or pos, met_line, " + "met_syll (those four last only for TEI files with proper annotation)" + , type=str, + default="words", choices=["words", "chars", "affixes", "pos", "lemma", "met_line", "met_syll"]) parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int) parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int) parser.add_argument('--freqs', action='store', help="relative, absolute or binarised freqs", default="relative", choices=["relative", "absolute", "binary"] ) - parser.add_argument('-x', action='store', help="format (txt, xml or tei) WARNING: only txt is fully implemented", + parser.add_argument('-x', action='store', help="format (txt, xml, tei, or txm) WARNING: only txt is fully implemented", default="txt", - choices=["txt", "xml", "tei"] + choices=["txt", "xml", "tei", 'txm'] ) parser.add_argument('--sampling', action='store_true', help="Sample the texts?", default=False) parser.add_argument('--sample_units', action='store', help="Units of length for sampling " diff --git a/tests/test_load_corpus.py b/tests/test_load_corpus.py index 5de65d26..076aa199 100644 --- a/tests/test_load_corpus.py +++ b/tests/test_load_corpus.py @@ -286,22 +286,7 @@ def test_load_corpus(self): self.assertEqual(sorted(feats), sorted(expected_feats)) self.assertEqual(corpus.to_dict(), expected_corpus) - # WHEN - corpus, feats = superstyl.load.load_corpus(sorted(self.paths[1:]), feats="pos", n=1, format="txt", freqsType="absolute") - - # THEN - expected_feats = [('DT', 4), ('NN', 2), ('VBZ', 2), ('RB', 1)] - expected_corpus = { - 'author': {'Smith_Letter1.txt': 'Smith', 'Smith_Letter2.txt': 'Smith'}, - 'lang': {'Smith_Letter1.txt': 'NA', 'Smith_Letter2.txt': 'NA'}, - 'DT': {'Smith_Letter1.txt': 2 , 'Smith_Letter2.txt': 2}, - 'NN': {'Smith_Letter1.txt': 1 , 'Smith_Letter2.txt': 1}, - 'VBZ': {'Smith_Letter1.txt': 1, 'Smith_Letter2.txt': 1}, - 'RB': {'Smith_Letter1.txt': 0, 
'Smith_Letter2.txt': 1} - } - - self.assertEqual(sorted(feats), sorted(expected_feats)) - self.assertEqual(corpus.to_dict(), expected_corpus) + # TODO: add tests for lemma, pos, met_line, met_syll, and loading from tei, and from txm # Now, test embedding # WHEN From e58cf74d0678afd00504f5629ed16f5e44acf182 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Thu, 20 Nov 2025 15:22:08 +0100 Subject: [PATCH 08/10] cleaned imports --- superstyl/load_from_config.py | 2 -- superstyl/preproc/pipe.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/superstyl/load_from_config.py b/superstyl/load_from_config.py index 5dc7d3a6..220b0b21 100644 --- a/superstyl/load_from_config.py +++ b/superstyl/load_from_config.py @@ -1,7 +1,5 @@ import json -import superstyl import pandas as pd -import os import glob from superstyl.load import load_corpus diff --git a/superstyl/preproc/pipe.py b/superstyl/preproc/pipe.py index 81819a00..811451cd 100755 --- a/superstyl/preproc/pipe.py +++ b/superstyl/preproc/pipe.py @@ -1,5 +1,3 @@ -import unicodedata - from lxml import etree import regex as re import unidecode From 05885dfb05ae814b1d238d3657494910c57a0e84 Mon Sep 17 00:00:00 2001 From: Theo Date: Thu, 27 Nov 2025 13:36:24 +0100 Subject: [PATCH 09/10] fix column rename --- superstyl/load_from_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superstyl/load_from_config.py b/superstyl/load_from_config.py index 5e3bfb3e..4bc7cca4 100644 --- a/superstyl/load_from_config.py +++ b/superstyl/load_from_config.py @@ -181,7 +181,7 @@ def load_corpus_from_config(config_path, is_test=False): feature_cols = [col for col in corpus.columns if col not in ['author', 'lang']] # Rename columns to avoid duplicates - renamed_cols = {col: col for col in feature_cols} + renamed_cols = {col: f"{name}_{col}" for col in feature_cols} feature_df = corpus[feature_cols].rename(columns=renamed_cols) # Merge with the main DataFrame From 011785119cf8ef4ac5ae5b6339c80679f0be1914 Mon Sep 17 
00:00:00 2001 From: Theo Date: Thu, 27 Nov 2025 15:44:07 +0100 Subject: [PATCH 10/10] Add feats option to the txm extraction function + add unit tests --- .coverage | Bin 0 -> 53248 bytes superstyl/preproc/pipe.py | 30 ++- tests/test_error_handling.py | 320 +++++++++++++++++++++++++++++ tests/test_xml_loading.py | 304 +++++++++++++++++++++++++++ tests/testdata/Dupont_TEIPoem1.xml | 30 +++ tests/testdata/Smith_Song1.xml | 5 + tests/testdata/Smith_TXM1.xml | 55 +++++ 7 files changed, 739 insertions(+), 5 deletions(-) create mode 100644 .coverage create mode 100644 tests/test_error_handling.py create mode 100644 tests/test_xml_loading.py create mode 100644 tests/testdata/Dupont_TEIPoem1.xml create mode 100644 tests/testdata/Smith_Song1.xml create mode 100644 tests/testdata/Smith_TXM1.xml diff --git a/.coverage b/.coverage new file mode 100644 index 0000000000000000000000000000000000000000..cb8c5ebec31f2b245cb66d3681fec473c5d7d5c4 GIT binary patch literal 53248 zcmeI4eQXow9mntNvmL+PfgB6snaI4Y+dPw*rtx5B51tdb1$(S zlBzO6sLl6e-*b1*^L+33e4gL)JbrB7)4zFxW~uU^ZWI(t{s=dZ<9Y5bS>`w{Lhm@e zy)8rsf_Fk8Uw1syVT4PLJQ$a@ak1dToU|*xN$QRrjo%i1DOLzS6g?JF!WoLd1_B@e z0w6HA2~;-6LWzY7`TM_QDe1gw8A?VqoPGS0Th^`Fx=!A@<`Wy%$<8%-aYUxAyIWo( z8~Pr(q#E*|mRDu1nAI|hr4@6sHKcl}%ch!T107wXp^ha?+WFa0O3P9%OU;o)$>Rb6(i(c9$I|7zRf|aHwcTZP zvhus+TFx%J7)8BHW`!URSQHH=`dWF8yiy%CcjxKzR!S*bx_zFa%1`yuF81a5Eulnv zJAYue?I;XSDy>=OX(lhv;5saRX1`&rq|r%OE6V3(eb8<)EVt}b4taBZo9Xdj zCBc(U z!u56}a-H6AFtM(ECfAvCfmHA0VWhG!7)rFY@%M*qH*#ys4MGhrc5-Uo zpVLiKqhXZ|Tjm|!Rk>qGFfRqFd$<7D@7IuXh!cTl4=sfkX}^hDiQ)?<14 z$r3SPss$zMG791V8`;KmY_l00ck)1V8`;KmY_l;QdFy z&-?fgoB#Wy)135g+F=6$5C8!X009sH0T2KI5C8!X009tqZxV?4{5MP9Up$scyuU5s z{t4h?$rZ_E>`(qa=?zYLL;Bx)vw*M~1V8`;KmY_l00ck)1V8`;KmY_lz#|az-^_d8 z0{9~Swy^sp0Q>!4jCOL;_oST?C!P@3#$S&=7{4R-&)C7(is+xC`zaC|2!H?xfB*=9 z00@8p2+TDCTSbAZboUJD1+~X2(Bchd&ssfGE~rJzq%{!=C3?h{EhwPm<u03s>G?q| zN7@51(q7rLcAdr3h<_nU;Y+6dCR5nR> 
zW0-Vry-M9wN-JtsDn)vqZ6Uq(ruAwB98VYskzDh&Rg?38s#^1Ejuw7S#vofqPnY!c5U!Xbm9SD+Ibn^ImZ!o|0Bl$@U$7(w&V4 z=#i=Okc~dlZrO+w|?tJSTWIfXY0 zw7gr>4pAfE%>P@iuC{7&9?<&vf9PJJa_4N=>WMeZ|AQH!vVJzzr|Gq6kr87JW9#>~0*8iJx0~rbo0w4eaAOHd&00JNY0w4ea zATYNG_<27UXTSgR(q)cb*gyaTKmY_l00ck)1V8`;KmY_l00iEL1pNMZ*na+BT**ln zrB|hwrN^a;G%9J*aw#ePMtnvz#eRy!1_B@e0w4eaAOHd&00JNY0w6G#3EUtGd^jmi z9I2kYTl(uakN>)Q{#%y|*`>F{>H4bpE}m;^ziCO|nbp;5?(o=nwc7vb4F|gpE$E9; zycL`HVE4s+?UzG)qJR7LvZ9ge`ugMZ51stc$)hJfp1b(`JtN87nK!GKs;763zOee} z-`ihM|8!XRK$Nn6IC^Xn`?z~Fo{a?faOG2xiS1k8Uj6oqmicCtZ7+Vb^(_ka5BgY8 zz3n;Lz4Geiue|)*AO1esdhX1{>Z6Y)dRD!rEjj$ihsG}a;Tsd@bNep@)}A;u{P2w@ zPAPW}A33t)m@zzj`uo*wsY)Ac|1)i(y{V0mCK|DB)z zm^i}S$C!g|d8c?|>7VQ}B z<$iE(y|pXI?S155-9bNP75%=1kHTfXeYdbo2Y<2Z?BRV+|M4)lg%9vTI4lP3=l`WA zIO&pfUOFeeF1 + @@ -79,7 +86,21 @@ def txm_to_units(path, units="lines"): - + + + + + + + + + + + + + + + ''') @@ -88,8 +109,7 @@ def txm_to_units(path, units="lines"): with open(path, 'r') as f: my_doc = etree.parse(f) - #units_tokens = str(myxsl(my_doc, units=etree.XSLT.strparam(units), feats=etree.XSLT.strparam(feats))).splitlines() - units_tokens = str(myxsl(my_doc, units=etree.XSLT.strparam(units))).splitlines() + units_tokens = str(myxsl(my_doc, units=etree.XSLT.strparam(units), feats=etree.XSLT.strparam(feats))).splitlines() return units_tokens def tei_to_units(path, feats="words", units="lines"): diff --git a/tests/test_error_handling.py b/tests/test_error_handling.py new file mode 100644 index 00000000..cc37116a --- /dev/null +++ b/tests/test_error_handling.py @@ -0,0 +1,320 @@ +import unittest +import superstyl.load +import superstyl.preproc.features_extract +from superstyl.load_from_config import load_corpus_from_config +import os +import tempfile +import json +import glob + +THIS_DIR = os.path.dirname(os.path.abspath(__file__)) + + +class ErrorHandlingTests(unittest.TestCase): + """Tests for error handling and ValueError raising""" + + def setUp(self): + """Set up test files paths""" + self.test_paths = 
sorted(glob.glob(os.path.join(THIS_DIR, "testdata/*.txt"))) + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + """Clean up temporary directory""" + self.temp_dir.cleanup() + + # ========================================================================= + # Tests for load.py - ValueError for incompatible formats + # ========================================================================= + + def test_load_corpus_lemma_requires_tei(self): + # SCENARIO: lemma features require TEI format + # GIVEN: Attempting to use lemma with non-TEI format + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.load.load_corpus( + self.test_paths, + feats="lemma", + format="txt" + ) + + self.assertIn("lemma", str(context.exception)) + self.assertIn("tei", str(context.exception).lower()) + + def test_load_corpus_pos_requires_tei(self): + # SCENARIO: pos features require TEI format + # GIVEN: Attempting to use pos with non-TEI format + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.load.load_corpus( + self.test_paths, + feats="pos", + format="txt" + ) + + self.assertIn("pos", str(context.exception)) + self.assertIn("tei", str(context.exception).lower()) + + def test_load_corpus_met_line_requires_tei(self): + # SCENARIO: met_line features require TEI format + # GIVEN: Attempting to use met_line with non-TEI format + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.load.load_corpus( + self.test_paths, + feats="met_line", + format="txt" + ) + + self.assertIn("met_line", str(context.exception)) + self.assertIn("tei", str(context.exception).lower()) + + def test_load_corpus_met_syll_requires_tei(self): + # SCENARIO: met_syll features require TEI format + # GIVEN: Attempting to use met_syll with non-TEI format + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + 
superstyl.load.load_corpus( + self.test_paths, + feats="met_syll", + format="txt" + ) + + self.assertIn("met_syll", str(context.exception)) + self.assertIn("tei", str(context.exception).lower()) + + def test_load_corpus_met_line_requires_lines_unit(self): + # SCENARIO: met_line requires units='lines' + # GIVEN: Attempting to use met_line with units='words' + + # Create a dummy TEI file for this test + tei_path = os.path.join(self.temp_dir.name, "test_met.xml") + with open(tei_path, 'w') as f: + f.write('test') + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.load.load_corpus( + [tei_path], + feats="met_line", + format="tei", + units="words" # Wrong unit type + ) + + self.assertIn("met_line", str(context.exception)) + self.assertIn("lines", str(context.exception)) + + def test_load_corpus_met_syll_requires_lines_unit(self): + # SCENARIO: met_syll requires units='lines' + # GIVEN: Attempting to use met_syll with units='words' + + # Create a dummy TEI file for this test + tei_path = os.path.join(self.temp_dir.name, "test_met2.xml") + with open(tei_path, 'w') as f: + f.write('test') + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.load.load_corpus( + [tei_path], + feats="met_syll", + format="tei", + units="words" # Wrong unit type + ) + + self.assertIn("met_syll", str(context.exception)) + self.assertIn("lines", str(context.exception)) + + # ========================================================================= + # Tests for features_extract.py - ValueError for invalid parameters + # ========================================================================= + + def test_count_features_empty_text(self): + # SCENARIO: Empty text should raise ValueError + # GIVEN: An empty string as text + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.preproc.features_extract.count_features( + "", # Empty text + 
feats="words", + n=1 + ) + + self.assertIn("empty", str(context.exception).lower()) + + def test_count_features_invalid_n_zero(self): + # SCENARIO: n must be positive + # GIVEN: n=0 + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.preproc.features_extract.count_features( + "test text", + feats="words", + n=0 # Invalid n + ) + + self.assertIn("positive", str(context.exception).lower()) + + def test_count_features_invalid_n_negative(self): + # SCENARIO: n must be positive + # GIVEN: n=-1 + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.preproc.features_extract.count_features( + "test text", + feats="words", + n=-1 # Invalid n + ) + + self.assertIn("positive", str(context.exception).lower()) + + def test_count_features_invalid_n_not_integer(self): + # SCENARIO: n must be an integer + # GIVEN: n=1.5 (float) + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.preproc.features_extract.count_features( + "test text", + feats="words", + n=1.5 # Not an integer + ) + + self.assertIn("integer", str(context.exception).lower()) + + def test_count_features_invalid_not_string(self): + # SCENARIO: text must be a string + # GIVEN: text is not a string (e.g., None) + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.preproc.features_extract.count_features( + None, # Not a string + feats="words", + n=1 + ) + + self.assertIn("string", str(context.exception).lower()) + + def test_count_features_unsupported_feats_type(self): + # SCENARIO: feats must be a supported type + # GIVEN: An unsupported feats type + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.preproc.features_extract.count_features( + "test text", + feats="unsupported_type", # Invalid feats type + n=1 + ) + + self.assertIn("Unsupported", str(context.exception)) + + def 
test_get_counts_invalid_frequency_type(self): + # SCENARIO: freqsType must be valid + # GIVEN: An unsupported frequency type + + myTexts = [{"name": "test", "text": "test text"}] + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.preproc.features_extract.get_counts( + myTexts, + feats="words", + freqsType="invalid_type" # Invalid frequency type + ) + + self.assertIn("Unsupported frequency type", str(context.exception)) + + # ========================================================================= + # Tests for load_from_config.py - Uncovered branches + # ========================================================================= + + def test_load_from_config_with_json_feature_list(self): + # SCENARIO: Load corpus with JSON feature list (line 119) + # GIVEN: A config with a JSON feature list + + # Create a JSON feature list + feature_list = [["the", 0], ["is", 0]] + feature_list_path = os.path.join(self.temp_dir.name, "features.json") + with open(feature_list_path, 'w') as f: + json.dump(feature_list, f) + + # Create config + config = { + "paths": self.test_paths, + "format": "txt", + "features": [ + { + "name": "test_feature", + "type": "words", + "n": 1, + "feat_list": feature_list_path # JSON feature list + } + ] + } + + config_path = os.path.join(self.temp_dir.name, "config.json") + with open(config_path, 'w') as f: + json.dump(config, f) + + # WHEN: Loading corpus from config + corpus, features = load_corpus_from_config(config_path) + + # THEN: Should load successfully with JSON feature list + self.assertIsNotNone(corpus) + self.assertIsNotNone(features) + + def test_load_from_config_test_mode_uses_feat_list(self): + # SCENARIO: In test mode, use provided feat_list (line 156) + # GIVEN: A config with feat_list in test mode + + # Create a JSON feature list + feature_list = [["the", 0], ["is", 0], ["text", 0]] + feature_list_path = os.path.join(self.temp_dir.name, "test_features.json") + with 
open(feature_list_path, 'w') as f: + json.dump(feature_list, f) + + # Create config with multiple features (triggers is_test logic) + config = { + "paths": self.test_paths, + "format": "txt", + "features": [ + { + "name": "feat1", + "type": "words", + "n": 1, + "feat_list": feature_list_path + }, + { + "name": "feat2", + "type": "chars", + "n": 2, + "feat_list": feature_list_path + } + ] + } + + config_path = os.path.join(self.temp_dir.name, "multi_config.json") + with open(config_path, 'w') as f: + json.dump(config, f) + + # WHEN: Loading corpus from config + corpus, features = load_corpus_from_config(config_path, is_test=True) + + # THEN: Should use the provided feature list + self.assertIsNotNone(corpus) + self.assertIsNotNone(features) + # features should be a list of feature lists + self.assertIsInstance(features, list) + self.assertEqual(len(features), 2) # Two feature sets + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_xml_loading.py b/tests/test_xml_loading.py new file mode 100644 index 00000000..c874bbb8 --- /dev/null +++ b/tests/test_xml_loading.py @@ -0,0 +1,304 @@ +import unittest +import superstyl.preproc.pipe +import os +import glob + +THIS_DIR = os.path.dirname(os.path.abspath(__file__)) + + +class XMLLoadingTests(unittest.TestCase): + """Tests for XML, TEI, and TXM file loading functions""" + + def setUp(self): + """Set up test files paths""" + self.xml_path = os.path.join(THIS_DIR, "testdata", "Smith_Song1.xml") + self.tei_path = os.path.join(THIS_DIR, "testdata", "Dupont_TEIPoem1.xml") + self.txm_path = os.path.join(THIS_DIR, "testdata", "Smith_TXM1.xml") + + def test_XML_to_text(self): + # SCENARIO: Load text from a simple XML file + # GIVEN: An XML file with author and text elements + + # WHEN: Loading the XML file + aut, text = superstyl.preproc.pipe.XML_to_text(self.xml_path) + + # THEN: Author and text are correctly extracted + self.assertEqual(aut, "Smith") + self.assertIn("test song", 
text) + self.assertIn("lyrics", text) + # Check that whitespace is normalized + self.assertNotIn(" ", text) + + def test_tei_to_units_words(self): + # SCENARIO: Extract words from a TEI file + # GIVEN: A TEI file with annotated words + + # WHEN: Extracting words as units + units_tokens = superstyl.preproc.pipe.tei_to_units( + self.tei_path, + feats="words", + units="words" + ) + + # THEN: Words are extracted, one per line + self.assertIsInstance(units_tokens, list) + self.assertGreater(len(units_tokens), 0) + # Each word should be on a separate line + self.assertIn("This", [u.strip() for u in units_tokens]) + self.assertIn("is", [u.strip() for u in units_tokens]) + + def test_tei_to_units_verses(self): + # SCENARIO: Extract verses (lines) from a TEI file + # GIVEN: A TEI file with verse lines + + # WHEN: Extracting verses as units + units_tokens = superstyl.preproc.pipe.tei_to_units( + self.tei_path, + feats="words", + units="verses" + ) + + # THEN: Each verse is on a separate line + self.assertIsInstance(units_tokens, list) + # We should have 2 lines in our test file + self.assertEqual(len(units_tokens), 2) + + def test_tei_to_units_lemma(self): + # SCENARIO: Extract lemmas from a TEI file + # GIVEN: A TEI file with lemma annotations + + # WHEN: Extracting lemmas + units_tokens = superstyl.preproc.pipe.tei_to_units( + self.tei_path, + feats="lemma", + units="words" + ) + + # THEN: Lemmas are extracted + self.assertIsInstance(units_tokens, list) + self.assertIn("this", [u.strip() for u in units_tokens]) + self.assertIn("be", [u.strip() for u in units_tokens]) + + def test_tei_to_units_pos(self): + # SCENARIO: Extract POS tags from a TEI file + # GIVEN: A TEI file with POS annotations + + # WHEN: Extracting POS tags + units_tokens = superstyl.preproc.pipe.tei_to_units( + self.tei_path, + feats="pos", + units="words" + ) + + # THEN: POS tags are extracted + self.assertIsInstance(units_tokens, list) + self.assertIn("DET", [u.strip() for u in units_tokens]) + 
self.assertIn("VERB", [u.strip() for u in units_tokens]) + + def test_tei_to_units_met_syll(self): + # SCENARIO: Extract metrical syllables from a TEI file + # GIVEN: A TEI file with metrical annotations + + # WHEN: Extracting metrical syllables with met_syll feature + units_tokens = superstyl.preproc.pipe.tei_to_units( + self.tei_path, + feats="met_syll", + units="verses" + ) + + # THEN: Metrical annotations are extracted + self.assertIsInstance(units_tokens, list) + # The @met attributes should be present + self.assertGreater(len(units_tokens), 0) + + def test_tei_to_units_met_line(self): + # SCENARIO: Extract metrical lines from a TEI file + # GIVEN: A TEI file with metrical annotations on lines + + # WHEN: Extracting metrical patterns at line level + units_tokens = superstyl.preproc.pipe.tei_to_units( + self.tei_path, + feats="met_line", + units="verses" + ) + + # THEN: Metrical patterns for each line are extracted + self.assertIsInstance(units_tokens, list) + self.assertEqual(len(units_tokens), 2) + # Should contain the metrical patterns + self.assertIn("01010101", units_tokens[0]) + self.assertIn("10101010", units_tokens[1]) + + def test_txm_to_units_words(self): + # SCENARIO: Extract words from a TXM file + # GIVEN: A TXM file with annotated words + + # WHEN: Extracting words as units + units_tokens = superstyl.preproc.pipe.txm_to_units( + self.txm_path, + units="words" + ) + + # THEN: Words are extracted + # Note: When extracting individual words (units='words'), + # the NOMpro filter is not applied + self.assertIsInstance(units_tokens, list) + self.assertGreater(len(units_tokens), 0) + text_content = ' '.join(units_tokens) + # All words should be present including those with NOMpro + self.assertIn("This", text_content) + self.assertIn("test", text_content) + + def test_txm_to_units_verses(self): + # SCENARIO: Extract verses from a TXM file + # GIVEN: A TXM file with verse lines + + # WHEN: Extracting verses as units + units_tokens = 
superstyl.preproc.pipe.txm_to_units( + self.txm_path, + units="verses" + ) + + # THEN: Each verse is extracted and NOMpro words are filtered out + self.assertIsInstance(units_tokens, list) + self.assertEqual(len(units_tokens), 2) + # Check that NOMpro words are excluded in verse mode + text_content = ' '.join(units_tokens) + self.assertNotIn("here", text_content) # "here" has NOMpro tag and should be filtered + self.assertIn("This", text_content) # Regular words should be present + + def test_txm_to_units_lemma(self): + # SCENARIO: Extract lemmas from a TXM file + # GIVEN: A TXM file with lemma annotations + + # WHEN: Extracting lemmas + units_tokens = superstyl.preproc.pipe.txm_to_units( + self.txm_path, + units="words", + feats="lemma" + ) + + # THEN: Lemmas are extracted + self.assertIsInstance(units_tokens, list) + self.assertIn("be", [u.strip() for u in units_tokens]) # lemma of "is" + self.assertIn("this", [u.strip() for u in units_tokens]) + + def test_txm_to_units_pos(self): + # SCENARIO: Extract POS tags from a TXM file + # GIVEN: A TXM file with POS annotations + + # WHEN: Extracting POS tags + units_tokens = superstyl.preproc.pipe.txm_to_units( + self.txm_path, + units="words", + feats="pos" + ) + + # THEN: POS tags are extracted + self.assertIsInstance(units_tokens, list) + self.assertIn("DET", [u.strip() for u in units_tokens]) + self.assertIn("VERB", [u.strip() for u in units_tokens]) + + def test_specialXML_to_text_tei(self): + # SCENARIO: Load text from a TEI file using specialXML_to_text + # GIVEN: A TEI format file + + # WHEN: Loading with format="tei" + aut, text = superstyl.preproc.pipe.specialXML_to_text( + self.tei_path, + format="tei", + feats="words" + ) + + # THEN: Author is extracted from filename and text is normalized + self.assertEqual(aut, "Dupont") + self.assertIsInstance(text, str) + self.assertGreater(len(text), 0) + # Check that whitespace is normalized (single spaces) + self.assertNotIn(" ", text) + + def 
test_specialXML_to_text_txm(self): + # SCENARIO: Load text from a TXM file using specialXML_to_text + # GIVEN: A TXM format file + + # WHEN: Loading with format="txm" + aut, text = superstyl.preproc.pipe.specialXML_to_text( + self.txm_path, + format="txm", + feats="words" + ) + + # THEN: Author is extracted from filename and text is normalized + self.assertEqual(aut, "Smith") + self.assertIsInstance(text, str) + self.assertGreater(len(text), 0) + # Text should contain words from the TXM file + self.assertIn("test", text.lower()) + + def test_specialXML_to_text_with_lemma(self): + # SCENARIO: Load lemmas from a TEI file + # GIVEN: A TEI file with lemma annotations + + # WHEN: Loading with feats="lemma" + aut, text = superstyl.preproc.pipe.specialXML_to_text( + self.tei_path, + format="tei", + feats="lemma" + ) + + # THEN: Lemmas are in the text + self.assertEqual(aut, "Dupont") + self.assertIn("be", text) # lemma of "is" + self.assertIn("this", text) + + def test_specialXML_to_text_with_pos(self): + # SCENARIO: Load POS tags from a TEI file + # GIVEN: A TEI file with POS annotations + + # WHEN: Loading with feats="pos" + aut, text = superstyl.preproc.pipe.specialXML_to_text( + self.tei_path, + format="tei", + feats="pos" + ) + + # THEN: POS tags are in the text + self.assertEqual(aut, "Dupont") + self.assertIn("DET", text) + self.assertIn("VERB", text) + + def test_specialXML_to_text_txm_with_lemma(self): + # SCENARIO: Load lemmas from a TXM file + # GIVEN: A TXM file with lemma annotations + + # WHEN: Loading with feats="lemma" + aut, text = superstyl.preproc.pipe.specialXML_to_text( + self.txm_path, + format="txm", + feats="lemma" + ) + + # THEN: Lemmas are in the text + self.assertEqual(aut, "Smith") + self.assertIn("be", text) # lemma of "is" + self.assertIn("this", text) + + def test_specialXML_to_text_txm_with_pos(self): + # SCENARIO: Load POS tags from a TXM file + # GIVEN: A TXM file with POS annotations + + # WHEN: Loading with feats="pos" + aut, text = 
superstyl.preproc.pipe.specialXML_to_text( + self.txm_path, + format="txm", + feats="pos" + ) + + # THEN: POS tags are in the text + self.assertEqual(aut, "Smith") + self.assertIn("DET", text) + self.assertIn("VERB", text) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/testdata/Dupont_TEIPoem1.xml b/tests/testdata/Dupont_TEIPoem1.xml new file mode 100644 index 00000000..a737babd --- /dev/null +++ b/tests/testdata/Dupont_TEIPoem1.xml @@ -0,0 +1,30 @@ + + + + + + Test Poem + + + + + + + + This + is + the + first + line + + + And + this + is + the + second + + + + + \ No newline at end of file diff --git a/tests/testdata/Smith_Song1.xml b/tests/testdata/Smith_Song1.xml new file mode 100644 index 00000000..d713cc96 --- /dev/null +++ b/tests/testdata/Smith_Song1.xml @@ -0,0 +1,5 @@ + + + Smith + This is a test song with some lyrics + \ No newline at end of file diff --git a/tests/testdata/Smith_TXM1.xml b/tests/testdata/Smith_TXM1.xml new file mode 100644 index 00000000..0411bf57 --- /dev/null +++ b/tests/testdata/Smith_TXM1.xml @@ -0,0 +1,55 @@ + + + + + + Test TXM Text + + + + + + + + + This + this + DET + + + is + be + VERB + + + a + a + DET + + + test + test + NOUN + + + + + Second + second + ADJ + + + line + line + NOUN + + + here + here + NOMpro + + + + + + \ No newline at end of file