diff --git a/.coverage b/.coverage new file mode 100644 index 00000000..cb8c5ebe Binary files /dev/null and b/.coverage differ diff --git a/load_corpus.py b/load_corpus.py index f325f30d..668e7524 100755 --- a/load_corpus.py +++ b/load_corpus.py @@ -18,18 +18,19 @@ parser.add_argument('-f', action="store", help="optional list of features, either in json (generated by" " Superstyl) or simple txt (one word per line)", default=False) parser.add_argument('-t', action='store', help="types of features (words, chars, affixes - " - "as per Sapkota et al. 2015 - or pos). pos are currently" - "only implemented for Modern English", type=str, - default="words", choices=["words", "chars", "affixes", "pos"]) + "as per Sapkota et al. 2015 -, as well as lemma or pos, met_line, " + "met_syll (those four last only for TEI files with proper annotation)" + , type=str, + default="words", choices=["words", "chars", "affixes", "pos", "lemma", "met_line", "met_syll"]) parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int) parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int) parser.add_argument('--freqs', action='store', help="relative, absolute or binarised freqs", default="relative", choices=["relative", "absolute", "binary"] ) - parser.add_argument('-x', action='store', help="format (txt, xml or tei) WARNING: only txt is fully implemented", + parser.add_argument('-x', action='store', help="format (txt, xml, tei, or txm) WARNING: only txt is fully implemented", default="txt", - choices=["txt", "xml", "tei"] + choices=["txt", "xml", "tei", 'txm'] ) parser.add_argument('--sampling', action='store_true', help="Sample the texts?", default=False) parser.add_argument('--sample_units', action='store', help="Units of length for sampling " diff --git a/superstyl/load.py b/superstyl/load.py index 4528c6f6..fe52683e 100644 --- a/superstyl/load.py +++ b/superstyl/load.py @@ -14,10 +14,11 @@ def 
load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp Main function to load a corpus from a collection of file, and an optional list of features to extract. :param data_paths: paths to the source files :param feat_list: an optional list of features (as created by load_corpus), default None - :param feats: the type of features, one of 'words', 'chars', 'affixes, and 'POS'. Affixes are inspired by - Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and, if keep_pos, punctuation n-grams. - POS are currently only implemented for Modern English - TODO: add met_line, met_syll + :param feats: the type of features, one of 'words', 'chars', 'affixes, 'lemma', 'pos', 'met_line' and 'met_syll'. + Affixes are inspired by Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and, + if keep_punct, punctuation n-grams. From TEI, pos, lemma, met_line or met_syll can + be extracted; met_line is the prosodic (stress) annotation of a full verse; met_syll is a char n-gram of prosodic + annotation :param n: n grams lengths (default 1) :param k: How many most frequent? The function takes the rank of k (if k is smaller than the total number of features), gets its frequencies, and only include features of superior or equal total frequencies. 
@@ -45,6 +46,12 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp :return a pandas dataFrame of text metadata and feature frequencies; a global list of features with their frequencies """ + if feats in ('lemma', 'pos', 'met_line', 'met_syll') and format != 'tei': + raise ValueError("lemma, pos, met_line or met_syll are only possible with adequate tei format (@lemma, @pos, @met)") + + if feats in ('met_line', 'met_syll') and units != 'lines': + raise ValueError("met_line or met_syll are only possible with tei format that includes lines and @met") + embeddedFreqs = False if embedding: print(".......loading embedding.......") @@ -81,7 +88,6 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp my_feats = [m[0] for m in feat_list] # keeping only the features without the frequencies myTexts = fex.get_counts(myTexts, feat_list=my_feats, feats=feats, n=n, freqsType=freqsType) - if embedding: print(".......embedding counts.......") myTexts, my_feats = embed.get_embedded_counts(myTexts, my_feats, model, topn=neighbouring_size) diff --git a/superstyl/load_from_config.py b/superstyl/load_from_config.py index 103dd853..4bc7cca4 100644 --- a/superstyl/load_from_config.py +++ b/superstyl/load_from_config.py @@ -1,12 +1,11 @@ import json -import superstyl import pandas as pd import os import glob from superstyl.load import load_corpus -def load_corpus_from_config(config_path): +def load_corpus_from_config(config_path, is_test=False): """ Load a corpus based on a JSON configuration file. 
@@ -55,7 +54,7 @@ def load_corpus_from_config(config_path): # Get sampling parameters sampling_params = config.get('sampling', {}) - + # Use the first feature to create the base corpus with sampling feature_configs = config.get('features', []) if not feature_configs: @@ -87,9 +86,9 @@ def load_corpus_from_config(config_path): 'sampling': sampling_params.get('enabled', False), 'units': sampling_params.get('units', 'words'), 'size': sampling_params.get('sample_size', 3000), - 'step': sampling_params.get('sample_step', None), + 'step': sampling_params.get('step', None), 'max_samples': sampling_params.get('max_samples', None), - 'samples_random': sampling_params.get('sample_random', False), + 'samples_random': sampling_params.get('samples_random', False), 'keep_punct': feature_config.get('keep_punct', False), 'keep_sym': feature_config.get('keep_sym', False), 'no_ascii': feature_config.get('no_ascii', False), @@ -115,6 +114,7 @@ def load_corpus_from_config(config_path): # Check for feature list file feat_list = None feat_list_path = feature_config.get('feat_list') + print(feat_list_path) if feat_list_path: if feat_list_path.endswith('.json'): with open(feat_list_path, 'r') as f: @@ -133,9 +133,9 @@ def load_corpus_from_config(config_path): 'sampling': sampling_params.get('enabled', False), 'units': sampling_params.get('units', 'words'), 'size': sampling_params.get('sample_size', 3000), - 'step': sampling_params.get('sample_step', None), + 'step': sampling_params.get('step', None), 'max_samples': sampling_params.get('max_samples', None), - 'samples_random': sampling_params.get('sample_random', False), + 'samples_random': sampling_params.get('samples_random', False), 'keep_punct': config.get('keep_punct', False), 'keep_sym': config.get('keep_sym', False), 'no_ascii': config.get('no_ascii', False), @@ -146,11 +146,17 @@ def load_corpus_from_config(config_path): } print(f"Loading {feature_name}...") + corpus, features = load_corpus(paths, feat_list=feat_list, **params) # 
Store corpus and features corpora[feature_name] = corpus - feature_lists[feature_name] = features + + if feat_list is not None and is_test: + feature_lists[feature_name] = feat_list + else: + feature_lists[feature_name] = features + # Create a merged dataset print("Creating merged dataset...") @@ -170,6 +176,8 @@ def load_corpus_from_config(config_path): # Add features from each corpus for name, corpus in corpora.items(): + single_feature = [] + feature_cols = [col for col in corpus.columns if col not in ['author', 'lang']] # Rename columns to avoid duplicates @@ -181,8 +189,9 @@ def load_corpus_from_config(config_path): # Add features to the combined list with prefixes for feature in feature_lists[name]: - all_features.append((f"{name}_{feature[0]}", feature[1])) + single_feature.append((feature[0], feature[1])) + all_features.append(single_feature) # Return the merged corpus and combined feature list return merged, all_features diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py index fe6ba5c0..d04d8c12 100755 --- a/superstyl/preproc/features_extract.py +++ b/superstyl/preproc/features_extract.py @@ -10,7 +10,7 @@ def count_features(text, feats ="words", n = 1): Get feature counts from a text (words, chars or POS n-grams, or affixes(+punct if keep_punct), following Sapkota et al., NAACL 2015 :param text: the source text - :param feats: the type of feats: words, chars, POS (supported only for English), or affixes + :param feats: the type of features, one of 'words', 'chars', 'affixes, 'lemma', 'pos', 'met_line' and 'met_syll'. 
:param n: the length of n-grams :return: features absolute frequencies in text as a counter, and the total of frequencies """ @@ -20,9 +20,9 @@ def count_features(text, feats ="words", n = 1): raise ValueError("Text cannot be empty.") if n < 1 or not isinstance(n, int): raise ValueError("n must be a positive integer.") - if feats not in ["words", "chars", "affixes", "pos", "met_line", "met_syll"]: - raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', or 'pos'.") - if feats == "words": + if feats not in ["words", "chars", "affixes", "lemma", "pos", "met_line", "met_syll"]: + raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', 'lemma' or 'pos'.") + if feats in ("words", "lemma", "pos"): tokens = nltk.tokenize.wordpunct_tokenize(text) if n > 1: tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))] @@ -46,20 +46,6 @@ def count_features(text, feats ="words", n = 1): ] tokens = affs + space_affs_and_punct - #POS in english with NLTK - need to propose spacy later on - elif feats == "pos": - try: - nltk.data.find('taggers/averaged_perceptron_tagger_eng') - except: - nltk.download('averaged_perceptron_tagger_eng') - words = nltk.tokenize.wordpunct_tokenize(text) - pos_tags = [pos for word, pos in nltk.pos_tag(words)] - if n > 1: - tokens = ["_".join(t) for t in list(nltk.ngrams(pos_tags, n))] - else: - tokens = pos_tags - total = len(tokens) - elif feats == "met_line": tokens = text.split() if n > 1: @@ -73,7 +59,7 @@ def count_features(text, feats ="words", n = 1): #Adding an error message in case some distracted guy like me would enter something wrong: else: - raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll' or 'pos'.") + raise ValueError("Unsupported feature type. 
Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', 'lemmas' or 'pos'.") counts = Counter() counts.update(tokens) @@ -108,7 +94,7 @@ def get_feature_list(myTexts, feats="words", n=1, freqsType="relative"): """ :param myTexts: a 'myTexts' object, containing documents to be processed :param feat_list: a list of features to be selected - :param feats: type of feats (words, chars, affixes, POS, met_line, or met_syll) + :param feats: the type of features, one of 'words', 'chars', 'affixes, 'lemma', 'pos', 'met_line' and 'met_syll'. :param freqsType: "relative", "absolute" or "binary" frequencies :param n: n-grams length :return: list of features, with total frequency @@ -142,14 +128,12 @@ def get_doc_frequency(myTexts): return feats_doc_freq - - def get_counts(myTexts, feat_list=None, feats = "words", n = 1, freqsType = "relative"): """ Get counts for a collection of texts :param myTexts: the document collection :param feat_list: a list of features to be selected (None for all) - :param feats: the type of feats (words, chars, affixes, POS) + :param feats: the type of features, one of 'words', 'chars', 'affixes, 'lemma', 'pos', 'met_line' and 'met_syll'. 
:param n: the length of n-grams :param freqsType: relative, absolute or binarised freqs :return: the collection with, for each text, a 'wordCounts' dictionary diff --git a/superstyl/preproc/pipe.py b/superstyl/preproc/pipe.py index c86949da..7c675b68 100755 --- a/superstyl/preproc/pipe.py +++ b/superstyl/preproc/pipe.py @@ -1,5 +1,3 @@ -import unicodedata - from lxml import etree import regex as re import unidecode @@ -52,6 +50,153 @@ def XML_to_text(path): return aut, re.sub(r"\s+", " ", str(myxsl(my_doc))) +def txm_to_units(path, units="lines", feats="words"): + """ + Extract units from TXM file + :param path: path to TXM file + :param units: units to extract ("lines"/"verses" or "words") + :param feats: features to extract ("words", "lemma", or "pos") + :return: list of extracted units + """ + myxsl = etree.XML(''' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +''') + myxsl = etree.XSLT(myxsl) + + with open(path, 'r') as f: + my_doc = etree.parse(f) + + units_tokens = str(myxsl(my_doc, units=etree.XSLT.strparam(units), feats=etree.XSLT.strparam(feats))).splitlines() + return units_tokens + +def tei_to_units(path, feats="words", units="lines"): + + if feats in ["met_syll", "met_line"]: + feats = "met" + myxsl = etree.XML(''' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ''') + myxsl = etree.XSLT(myxsl) + + with open(path, 'r') as f: + my_doc = etree.parse(f) + + units_tokens = str(myxsl(my_doc, units=etree.XSLT.strparam(units), feats=etree.XSLT.strparam(feats))).splitlines() + return units_tokens + +def specialXML_to_text(path, format="tei", feats="words"): + aut = path.split('/')[-1].split("_")[0] + if format=="tei": + units_tokens = tei_to_units(path, feats=feats, units="words") + + if format=="txm": + units_tokens = txm_to_units(path, feats=feats, units="words") + + return aut, re.sub(r"\s+", " ", str(' '.join(units_tokens))) + def TXT_to_text(path): """ 
Get main text from xml file @@ -147,7 +292,7 @@ def load_texts(paths, identify_lang=False, feats="words", format="txt", keep_pun Loads a collection of documents into a 'myTexts' object for further processing. TODO: a proper class :param paths: path to docs - TODO: add feats! + :param feats: the type of features, one of 'words', 'chars', 'affixes, 'lemma', 'pos', 'met_line' and 'met_syll'. :param identify_lang: whether or not try to identify lang (default: False) :param format: format of the source files (implemented values: txt [default], xml) :param keep_punct: whether or not to keep punctuation and caps. @@ -165,6 +310,9 @@ def load_texts(paths, identify_lang=False, feats="words", format="txt", keep_pun if format=='xml': aut, text = XML_to_text(path) + if format in ('tei', 'txm'): + aut, text = specialXML_to_text(path, format=format, feats=feats) + else: aut, text = TXT_to_text(path) @@ -198,7 +346,7 @@ def get_samples(path, size, step=None, samples_random=False, max_samples=10, :param max_samples: maximum number of samples per author/clas :param units: the units to use, one of "words" or "verses" :param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED) - :param feats: the type of features, TODO: document + :param feats: the type of features, one of 'words', 'chars', 'affixes, 'lemma', 'pos', 'met_line' and 'met_syll'. 
""" if samples_random and step is not None: @@ -213,127 +361,26 @@ def get_samples(path, size, step=None, samples_random=False, max_samples=10, if units == "words" and format == "txt": my_doc = TXT_to_text(path) text = normalise(my_doc[1], keep_punct=keep_punct, keep_sym=keep_sym, no_ascii=no_ascii) - units = nltk.tokenize.wordpunct_tokenize(text) + units_tokens = nltk.tokenize.wordpunct_tokenize(text) - #TODO: DOCUMENT this format as TXM, and keep it only for retrocompatibility + #Kept only for retrocompatibility with Psysché if units == "verses" and format == "txm": - myxsl = etree.XML(''' - - - - - - - - - - - - - - - - - - ''') - myxsl = etree.XSLT(myxsl) - - with open(path, 'r') as f: - my_doc = etree.parse(f) - - units = str(myxsl(my_doc)).splitlines() - - # and now generating output - samples = [] + units_tokens = txm_to_units(path, units=units) if format == "tei": - if feats in ["met_syll", "met_line"]: - feats = "met" - myxsl = etree.XML(''' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -''') - myxsl = etree.XSLT(myxsl) - - with open(path, 'r') as f: - my_doc = etree.parse(f) - - units = str(myxsl(my_doc, units=etree.XSLT.strparam(units), feats=etree.XSLT.strparam(feats))).splitlines() + units_tokens = tei_to_units(path, units=units, feats=feats) # and now generating output samples = [] if samples_random: for k in range(max_samples): - samples.append({"start": str(k)+'s', "end": str(k)+'e', "text": list(random.choices(units, k=size))}) + samples.append({"start": str(k)+'s', "end": str(k)+'e', "text": list(random.choices(units_tokens, k=size))}) else: current = 0 - while current + size <= len(units): - samples.append({"start": current, "end": current + size, "text": list(units[current:(current + size)])}) + while current + size <= len(units_tokens): + samples.append({"start": current, "end": current + size, "text": list(units_tokens[current:(current + size)])}) current = current + step 
return samples @@ -353,7 +400,7 @@ def docs_to_samples(paths, size, step=None, units="words", samples_random=False, :param keep_punct: whether to keep punctuation and caps. :param max_samples: maximum number of samples per author/class. :param identify_lang: whether to try to identify lang (default: False) - :param feats: TODO + :param feats: the type of features, one of 'words', 'chars', 'affixes, 'lemma', 'pos', 'met_line' and 'met_syll'. :return: a myTexts object """ myTexts = [] diff --git a/superstyl/svm.py b/superstyl/svm.py index 204987a5..abceb823 100755 --- a/superstyl/svm.py +++ b/superstyl/svm.py @@ -288,13 +288,13 @@ def plot_coefficients(coefs, feature_names, current_class, top_features=10): -def plot_rolling(final_predictions, smoothing=3): +def plot_rolling(final_predictions, smoothing=3, xlab = "Index (segment center)"): """ Plots the rolling stylometry results as lines of decision function values over the text. Parameters: - final_predictions_path : str - Path to the CSV file containing final predictions generated by the SVM pipeline. + final_predictions : Pandas dataframe containing the final predictions out of train_svm + . smoothing : int or None The window size for smoothing the curves. 
@@ -304,24 +304,25 @@ def plot_rolling(final_predictions, smoothing=3): """ # Extract the segment center from the filename + my_final_predictions = final_predictions.copy() # to avoid modifying in place segment_centers = [] - for fname in final_predictions['filename']: + for fname in my_final_predictions['filename']: parts = fname.split('_')[-1].split('-') start = int(parts[0]) end = int(parts[1]) center = (start + end) / 2.0 segment_centers.append(center) - final_predictions['segment_center'] = segment_centers + my_final_predictions['segment_center'] = segment_centers - final_predictions['filename'] = [fname.split('_')[1] for fname in final_predictions['filename']] + my_final_predictions['filename'] = [fname.split('_')[1] for fname in my_final_predictions['filename']] # Identify candidate columns known_cols = {'filename', 'author', 'segment_center'} - candidate_cols = [c for c in final_predictions.columns if c not in known_cols] + candidate_cols = [c for c in my_final_predictions.columns if c not in known_cols] - for work in final_predictions['filename'].unique(): - fpreds_work = final_predictions[final_predictions['filename'] == work] + for work in my_final_predictions['filename'].unique(): + fpreds_work = my_final_predictions[my_final_predictions['filename'] == work] # Sort by segment center to ensure chronological order fpreds_work = fpreds_work.sort_values('segment_center') @@ -336,7 +337,7 @@ def plot_rolling(final_predictions, smoothing=3): plt.plot(fpreds_work['segment_center'], fpreds_work[col], label=col, linewidth=2) plt.title('Rolling Stylometry Decision Functions Over ' + work) - plt.xlabel('Word index (segment center)') + plt.xlabel(xlab) plt.ylabel('Decision Function Value') plt.ylim(min(-2, min(fpreds_work[candidate_cols].min()) - 0.2), max(1, max(fpreds_work[candidate_cols].max())) + 0.2) diff --git a/tests/test_error_handling.py b/tests/test_error_handling.py new file mode 100644 index 00000000..cc37116a --- /dev/null +++ 
b/tests/test_error_handling.py @@ -0,0 +1,320 @@ +import unittest +import superstyl.load +import superstyl.preproc.features_extract +from superstyl.load_from_config import load_corpus_from_config +import os +import tempfile +import json +import glob + +THIS_DIR = os.path.dirname(os.path.abspath(__file__)) + + +class ErrorHandlingTests(unittest.TestCase): + """Tests for error handling and ValueError raising""" + + def setUp(self): + """Set up test files paths""" + self.test_paths = sorted(glob.glob(os.path.join(THIS_DIR, "testdata/*.txt"))) + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + """Clean up temporary directory""" + self.temp_dir.cleanup() + + # ========================================================================= + # Tests pour load.py - ValueError pour formats incompatibles + # ========================================================================= + + def test_load_corpus_lemma_requires_tei(self): + # SCENARIO: lemma features require TEI format + # GIVEN: Attempting to use lemma with non-TEI format + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.load.load_corpus( + self.test_paths, + feats="lemma", + format="txt" + ) + + self.assertIn("lemma", str(context.exception)) + self.assertIn("tei", str(context.exception).lower()) + + def test_load_corpus_pos_requires_tei(self): + # SCENARIO: pos features require TEI format + # GIVEN: Attempting to use pos with non-TEI format + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.load.load_corpus( + self.test_paths, + feats="pos", + format="txt" + ) + + self.assertIn("pos", str(context.exception)) + self.assertIn("tei", str(context.exception).lower()) + + def test_load_corpus_met_line_requires_tei(self): + # SCENARIO: met_line features require TEI format + # GIVEN: Attempting to use met_line with non-TEI format + + # WHEN/THEN: Should raise ValueError + with 
self.assertRaises(ValueError) as context: + superstyl.load.load_corpus( + self.test_paths, + feats="met_line", + format="txt" + ) + + self.assertIn("met_line", str(context.exception)) + self.assertIn("tei", str(context.exception).lower()) + + def test_load_corpus_met_syll_requires_tei(self): + # SCENARIO: met_syll features require TEI format + # GIVEN: Attempting to use met_syll with non-TEI format + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.load.load_corpus( + self.test_paths, + feats="met_syll", + format="txt" + ) + + self.assertIn("met_syll", str(context.exception)) + self.assertIn("tei", str(context.exception).lower()) + + def test_load_corpus_met_line_requires_lines_unit(self): + # SCENARIO: met_line requires units='lines' + # GIVEN: Attempting to use met_line with units='words' + + # Create a dummy TEI file for this test + tei_path = os.path.join(self.temp_dir.name, "test_met.xml") + with open(tei_path, 'w') as f: + f.write('test') + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.load.load_corpus( + [tei_path], + feats="met_line", + format="tei", + units="words" # Wrong unit type + ) + + self.assertIn("met_line", str(context.exception)) + self.assertIn("lines", str(context.exception)) + + def test_load_corpus_met_syll_requires_lines_unit(self): + # SCENARIO: met_syll requires units='lines' + # GIVEN: Attempting to use met_syll with units='words' + + # Create a dummy TEI file for this test + tei_path = os.path.join(self.temp_dir.name, "test_met2.xml") + with open(tei_path, 'w') as f: + f.write('test') + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.load.load_corpus( + [tei_path], + feats="met_syll", + format="tei", + units="words" # Wrong unit type + ) + + self.assertIn("met_syll", str(context.exception)) + self.assertIn("lines", str(context.exception)) + + # 
========================================================================= + # Tests pour features_extract.py - ValueError pour paramètres invalides + # ========================================================================= + + def test_count_features_empty_text(self): + # SCENARIO: Empty text should raise ValueError + # GIVEN: An empty string as text + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.preproc.features_extract.count_features( + "", # Empty text + feats="words", + n=1 + ) + + self.assertIn("empty", str(context.exception).lower()) + + def test_count_features_invalid_n_zero(self): + # SCENARIO: n must be positive + # GIVEN: n=0 + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.preproc.features_extract.count_features( + "test text", + feats="words", + n=0 # Invalid n + ) + + self.assertIn("positive", str(context.exception).lower()) + + def test_count_features_invalid_n_negative(self): + # SCENARIO: n must be positive + # GIVEN: n=-1 + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.preproc.features_extract.count_features( + "test text", + feats="words", + n=-1 # Invalid n + ) + + self.assertIn("positive", str(context.exception).lower()) + + def test_count_features_invalid_n_not_integer(self): + # SCENARIO: n must be an integer + # GIVEN: n=1.5 (float) + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.preproc.features_extract.count_features( + "test text", + feats="words", + n=1.5 # Not an integer + ) + + self.assertIn("integer", str(context.exception).lower()) + + def test_count_features_invalid_not_string(self): + # SCENARIO: text must be a string + # GIVEN: text is not a string (e.g., None) + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.preproc.features_extract.count_features( + None, # Not a 
string + feats="words", + n=1 + ) + + self.assertIn("string", str(context.exception).lower()) + + def test_count_features_unsupported_feats_type(self): + # SCENARIO: feats must be a supported type + # GIVEN: An unsupported feats type + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.preproc.features_extract.count_features( + "test text", + feats="unsupported_type", # Invalid feats type + n=1 + ) + + self.assertIn("Unsupported", str(context.exception)) + + def test_get_counts_invalid_frequency_type(self): + # SCENARIO: freqsType must be valid + # GIVEN: An unsupported frequency type + + myTexts = [{"name": "test", "text": "test text"}] + + # WHEN/THEN: Should raise ValueError + with self.assertRaises(ValueError) as context: + superstyl.preproc.features_extract.get_counts( + myTexts, + feats="words", + freqsType="invalid_type" # Invalid frequency type + ) + + self.assertIn("Unsupported frequency type", str(context.exception)) + + # ========================================================================= + # Tests pour load_from_config.py - Branches non couvertes + # ========================================================================= + + def test_load_from_config_with_json_feature_list(self): + # SCENARIO: Load corpus with JSON feature list (ligne 119) + # GIVEN: A config with a JSON feature list + + # Create a JSON feature list + feature_list = [["the", 0], ["is", 0]] + feature_list_path = os.path.join(self.temp_dir.name, "features.json") + with open(feature_list_path, 'w') as f: + json.dump(feature_list, f) + + # Create config + config = { + "paths": self.test_paths, + "format": "txt", + "features": [ + { + "name": "test_feature", + "type": "words", + "n": 1, + "feat_list": feature_list_path # JSON feature list + } + ] + } + + config_path = os.path.join(self.temp_dir.name, "config.json") + with open(config_path, 'w') as f: + json.dump(config, f) + + # WHEN: Loading corpus from config + corpus, features = 
load_corpus_from_config(config_path) + + # THEN: Should load successfully with JSON feature list + self.assertIsNotNone(corpus) + self.assertIsNotNone(features) + + def test_load_from_config_test_mode_uses_feat_list(self): + # SCENARIO: In test mode, use provided feat_list (ligne 156) + # GIVEN: A config with feat_list in test mode + + # Create a JSON feature list + feature_list = [["the", 0], ["is", 0], ["text", 0]] + feature_list_path = os.path.join(self.temp_dir.name, "test_features.json") + with open(feature_list_path, 'w') as f: + json.dump(feature_list, f) + + # Create config with multiple features (triggers is_test logic) + config = { + "paths": self.test_paths, + "format": "txt", + "features": [ + { + "name": "feat1", + "type": "words", + "n": 1, + "feat_list": feature_list_path + }, + { + "name": "feat2", + "type": "chars", + "n": 2, + "feat_list": feature_list_path + } + ] + } + + config_path = os.path.join(self.temp_dir.name, "multi_config.json") + with open(config_path, 'w') as f: + json.dump(config, f) + + # WHEN: Loading corpus from config + corpus, features = load_corpus_from_config(config_path, is_test=True) + + # THEN: Should use the provided feature list + self.assertIsNotNone(corpus) + self.assertIsNotNone(features) + # features should be a list of feature lists + self.assertIsInstance(features, list) + self.assertEqual(len(features), 2) # Two feature sets + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_load_corpus.py b/tests/test_load_corpus.py index 5de65d26..076aa199 100644 --- a/tests/test_load_corpus.py +++ b/tests/test_load_corpus.py @@ -286,22 +286,7 @@ def test_load_corpus(self): self.assertEqual(sorted(feats), sorted(expected_feats)) self.assertEqual(corpus.to_dict(), expected_corpus) - # WHEN - corpus, feats = superstyl.load.load_corpus(sorted(self.paths[1:]), feats="pos", n=1, format="txt", freqsType="absolute") - - # THEN - expected_feats = [('DT', 4), ('NN', 2), ('VBZ', 2), ('RB', 
import unittest
import superstyl.preproc.pipe
import os
import glob

THIS_DIR = os.path.dirname(os.path.abspath(__file__))


class XMLLoadingTests(unittest.TestCase):
    """Tests for XML, TEI, and TXM file loading functions.

    Every test reads one of three fixtures from the ``testdata`` directory
    next to this file and calls a loader from ``superstyl.preproc.pipe``:
    ``XML_to_text``, ``tei_to_units``, ``txm_to_units`` or
    ``specialXML_to_text``.
    """

    def setUp(self):
        """Set up test files paths"""
        testdata = os.path.join(THIS_DIR, "testdata")
        self.xml_path = os.path.join(testdata, "Smith_Song1.xml")
        self.tei_path = os.path.join(testdata, "Dupont_TEIPoem1.xml")
        self.txm_path = os.path.join(testdata, "Smith_TXM1.xml")

    def test_XML_to_text(self):
        """XML_to_text returns the author and the whitespace-normalized text."""
        # SCENARIO: Load text from a simple XML file
        # GIVEN: An XML file with author and text elements

        # WHEN: Loading the XML file
        aut, text = superstyl.preproc.pipe.XML_to_text(self.xml_path)

        # THEN: Author and text are correctly extracted
        self.assertEqual(aut, "Smith")
        self.assertIn("test song", text)
        self.assertIn("lyrics", text)
        # Whitespace is normalized: no run of two spaces may remain. A single
        # space cannot be the needle, since "test song" itself contains one.
        self.assertNotIn("  ", text)

    def test_tei_to_units_words(self):
        """tei_to_units yields the word forms when feats and units are 'words'."""
        # SCENARIO: Extract words from a TEI file
        # GIVEN: A TEI file with annotated words

        # WHEN: Extracting words as units
        units_tokens = superstyl.preproc.pipe.tei_to_units(
            self.tei_path,
            feats="words",
            units="words"
        )

        # THEN: Words are extracted, one per unit
        self.assertIsInstance(units_tokens, list)
        self.assertGreater(len(units_tokens), 0)
        # Compare on stripped units so surrounding whitespace is irrelevant
        stripped = [u.strip() for u in units_tokens]
        self.assertIn("This", stripped)
        self.assertIn("is", stripped)

    def test_tei_to_units_verses(self):
        """tei_to_units yields one unit per verse when units='verses'."""
        # SCENARIO: Extract verses (lines) from a TEI file
        # GIVEN: A TEI file with verse lines

        # WHEN: Extracting verses as units
        units_tokens = superstyl.preproc.pipe.tei_to_units(
            self.tei_path,
            feats="words",
            units="verses"
        )

        # THEN: Each verse is a separate unit
        self.assertIsInstance(units_tokens, list)
        # We should have 2 lines in our test file
        self.assertEqual(len(units_tokens), 2)

    def test_tei_to_units_lemma(self):
        """tei_to_units yields lemmas when feats='lemma'."""
        # SCENARIO: Extract lemmas from a TEI file
        # GIVEN: A TEI file with lemma annotations

        # WHEN: Extracting lemmas
        units_tokens = superstyl.preproc.pipe.tei_to_units(
            self.tei_path,
            feats="lemma",
            units="words"
        )

        # THEN: Lemmas are extracted
        self.assertIsInstance(units_tokens, list)
        stripped = [u.strip() for u in units_tokens]
        self.assertIn("this", stripped)
        self.assertIn("be", stripped)

    def test_tei_to_units_pos(self):
        """tei_to_units yields POS tags when feats='pos'."""
        # SCENARIO: Extract POS tags from a TEI file
        # GIVEN: A TEI file with POS annotations

        # WHEN: Extracting POS tags
        units_tokens = superstyl.preproc.pipe.tei_to_units(
            self.tei_path,
            feats="pos",
            units="words"
        )

        # THEN: POS tags are extracted
        self.assertIsInstance(units_tokens, list)
        stripped = [u.strip() for u in units_tokens]
        self.assertIn("DET", stripped)
        self.assertIn("VERB", stripped)

    def test_tei_to_units_met_syll(self):
        """tei_to_units returns a non-empty list when feats='met_syll'."""
        # SCENARIO: Extract metrical syllables from a TEI file
        # GIVEN: A TEI file with metrical annotations

        # WHEN: Extracting metrical syllables with met_syll feature
        units_tokens = superstyl.preproc.pipe.tei_to_units(
            self.tei_path,
            feats="met_syll",
            units="verses"
        )

        # THEN: Metrical annotations are extracted
        self.assertIsInstance(units_tokens, list)
        # The @met attributes should be present
        self.assertGreater(len(units_tokens), 0)

    def test_tei_to_units_met_line(self):
        """tei_to_units yields one metrical pattern per verse when feats='met_line'."""
        # SCENARIO: Extract metrical lines from a TEI file
        # GIVEN: A TEI file with metrical annotations on lines

        # WHEN: Extracting metrical patterns at line level
        units_tokens = superstyl.preproc.pipe.tei_to_units(
            self.tei_path,
            feats="met_line",
            units="verses"
        )

        # THEN: Metrical patterns for each line are extracted
        self.assertIsInstance(units_tokens, list)
        self.assertEqual(len(units_tokens), 2)
        # Should contain the metrical patterns
        self.assertIn("01010101", units_tokens[0])
        self.assertIn("10101010", units_tokens[1])

    def test_txm_to_units_words(self):
        """txm_to_units yields the word forms when units='words'."""
        # SCENARIO: Extract words from a TXM file
        # GIVEN: A TXM file with annotated words

        # WHEN: Extracting words as units
        units_tokens = superstyl.preproc.pipe.txm_to_units(
            self.txm_path,
            units="words"
        )

        # THEN: Words are extracted
        # Note: When extracting individual words (units='words'),
        # the NOMpro filter is not applied
        self.assertIsInstance(units_tokens, list)
        self.assertGreater(len(units_tokens), 0)
        text_content = ' '.join(units_tokens)
        # All words should be present including those with NOMpro
        self.assertIn("This", text_content)
        self.assertIn("test", text_content)

    def test_txm_to_units_verses(self):
        """txm_to_units yields one unit per verse and drops NOMpro words."""
        # SCENARIO: Extract verses from a TXM file
        # GIVEN: A TXM file with verse lines

        # WHEN: Extracting verses as units
        units_tokens = superstyl.preproc.pipe.txm_to_units(
            self.txm_path,
            units="verses"
        )

        # THEN: Each verse is extracted and NOMpro words are filtered out
        self.assertIsInstance(units_tokens, list)
        self.assertEqual(len(units_tokens), 2)
        # Check that NOMpro words are excluded in verse mode
        text_content = ' '.join(units_tokens)
        # "here" has NOMpro tag and should be filtered
        self.assertNotIn("here", text_content)
        # Regular words should be present
        self.assertIn("This", text_content)

    def test_txm_to_units_lemma(self):
        """txm_to_units yields lemmas when feats='lemma'."""
        # SCENARIO: Extract lemmas from a TXM file
        # GIVEN: A TXM file with lemma annotations

        # WHEN: Extracting lemmas
        units_tokens = superstyl.preproc.pipe.txm_to_units(
            self.txm_path,
            units="words",
            feats="lemma"
        )

        # THEN: Lemmas are extracted
        self.assertIsInstance(units_tokens, list)
        stripped = [u.strip() for u in units_tokens]
        self.assertIn("be", stripped)  # lemma of "is"
        self.assertIn("this", stripped)

    def test_txm_to_units_pos(self):
        """txm_to_units yields POS tags when feats='pos'."""
        # SCENARIO: Extract POS tags from a TXM file
        # GIVEN: A TXM file with POS annotations

        # WHEN: Extracting POS tags
        units_tokens = superstyl.preproc.pipe.txm_to_units(
            self.txm_path,
            units="words",
            feats="pos"
        )

        # THEN: POS tags are extracted
        self.assertIsInstance(units_tokens, list)
        stripped = [u.strip() for u in units_tokens]
        self.assertIn("DET", stripped)
        self.assertIn("VERB", stripped)

    def test_specialXML_to_text_tei(self):
        """specialXML_to_text(format='tei') returns the author and a normalized string."""
        # SCENARIO: Load text from a TEI file using specialXML_to_text
        # GIVEN: A TEI format file

        # WHEN: Loading with format="tei"
        aut, text = superstyl.preproc.pipe.specialXML_to_text(
            self.tei_path,
            format="tei",
            feats="words"
        )

        # THEN: Author is extracted from filename and text is normalized
        self.assertEqual(aut, "Dupont")
        self.assertIsInstance(text, str)
        self.assertGreater(len(text), 0)
        # Whitespace is normalized to single spaces: no double space remains
        self.assertNotIn("  ", text)

    def test_specialXML_to_text_txm(self):
        """specialXML_to_text(format='txm') returns the author and a non-empty string."""
        # SCENARIO: Load text from a TXM file using specialXML_to_text
        # GIVEN: A TXM format file

        # WHEN: Loading with format="txm"
        aut, text = superstyl.preproc.pipe.specialXML_to_text(
            self.txm_path,
            format="txm",
            feats="words"
        )

        # THEN: Author is extracted from filename and text is normalized
        self.assertEqual(aut, "Smith")
        self.assertIsInstance(text, str)
        self.assertGreater(len(text), 0)
        # Text should contain words from the TXM file
        self.assertIn("test", text.lower())

    def test_specialXML_to_text_with_lemma(self):
        """specialXML_to_text(format='tei', feats='lemma') returns lemmas."""
        # SCENARIO: Load lemmas from a TEI file
        # GIVEN: A TEI file with lemma annotations

        # WHEN: Loading with feats="lemma"
        aut, text = superstyl.preproc.pipe.specialXML_to_text(
            self.tei_path,
            format="tei",
            feats="lemma"
        )

        # THEN: Lemmas are in the text
        self.assertEqual(aut, "Dupont")
        self.assertIn("be", text)  # lemma of "is"
        self.assertIn("this", text)

    def test_specialXML_to_text_with_pos(self):
        """specialXML_to_text(format='tei', feats='pos') returns POS tags."""
        # SCENARIO: Load POS tags from a TEI file
        # GIVEN: A TEI file with POS annotations

        # WHEN: Loading with feats="pos"
        aut, text = superstyl.preproc.pipe.specialXML_to_text(
            self.tei_path,
            format="tei",
            feats="pos"
        )

        # THEN: POS tags are in the text
        self.assertEqual(aut, "Dupont")
        self.assertIn("DET", text)
        self.assertIn("VERB", text)

    def test_specialXML_to_text_txm_with_lemma(self):
        """specialXML_to_text(format='txm', feats='lemma') returns lemmas."""
        # SCENARIO: Load lemmas from a TXM file
        # GIVEN: A TXM file with lemma annotations

        # WHEN: Loading with feats="lemma"
        aut, text = superstyl.preproc.pipe.specialXML_to_text(
            self.txm_path,
            format="txm",
            feats="lemma"
        )

        # THEN: Lemmas are in the text
        self.assertEqual(aut, "Smith")
        self.assertIn("be", text)  # lemma of "is"
        self.assertIn("this", text)

    def test_specialXML_to_text_txm_with_pos(self):
        """specialXML_to_text(format='txm', feats='pos') returns POS tags."""
        # SCENARIO: Load POS tags from a TXM file
        # GIVEN: A TXM file with POS annotations

        # WHEN: Loading with feats="pos"
        aut, text = superstyl.preproc.pipe.specialXML_to_text(
            self.txm_path,
            format="txm",
            feats="pos"
        )

        # THEN: POS tags are in the text
        self.assertEqual(aut, "Smith")
        self.assertIn("DET", text)
        self.assertIn("VERB", text)


if __name__ == '__main__':
    unittest.main()
b/tests/testdata/Dupont_TEIPoem1.xml new file mode 100644 index 00000000..a737babd --- /dev/null +++ b/tests/testdata/Dupont_TEIPoem1.xml @@ -0,0 +1,30 @@ + + + + + + Test Poem + + + + + + + + This + is + the + first + line + + + And + this + is + the + second + + + + + \ No newline at end of file diff --git a/tests/testdata/Smith_Song1.xml b/tests/testdata/Smith_Song1.xml new file mode 100644 index 00000000..d713cc96 --- /dev/null +++ b/tests/testdata/Smith_Song1.xml @@ -0,0 +1,5 @@ + + + Smith + This is a test song with some lyrics + \ No newline at end of file diff --git a/tests/testdata/Smith_TXM1.xml b/tests/testdata/Smith_TXM1.xml new file mode 100644 index 00000000..0411bf57 --- /dev/null +++ b/tests/testdata/Smith_TXM1.xml @@ -0,0 +1,55 @@ + + + + + + Test TXM Text + + + + + + + + + This + this + DET + + + is + be + VERB + + + a + a + DET + + + test + test + NOUN + + + + + Second + second + ADJ + + + line + line + NOUN + + + here + here + NOMpro + + + + + + \ No newline at end of file