diff --git a/.coverage b/.coverage
new file mode 100644
index 00000000..cb8c5ebe
Binary files /dev/null and b/.coverage differ
diff --git a/load_corpus.py b/load_corpus.py
index f325f30d..668e7524 100755
--- a/load_corpus.py
+++ b/load_corpus.py
@@ -18,18 +18,19 @@
parser.add_argument('-f', action="store", help="optional list of features, either in json (generated by"
" Superstyl) or simple txt (one word per line)", default=False)
parser.add_argument('-t', action='store', help="types of features (words, chars, affixes - "
- "as per Sapkota et al. 2015 - or pos). pos are currently"
- "only implemented for Modern English", type=str,
- default="words", choices=["words", "chars", "affixes", "pos"])
+ "as per Sapkota et al. 2015 -, as well as lemma or pos, met_line, "
+ "met_syll (those four last only for TEI files with proper annotation)"
+ , type=str,
+ default="words", choices=["words", "chars", "affixes", "pos", "lemma", "met_line", "met_syll"])
parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int)
parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
parser.add_argument('--freqs', action='store', help="relative, absolute or binarised freqs",
default="relative",
choices=["relative", "absolute", "binary"]
)
- parser.add_argument('-x', action='store', help="format (txt, xml or tei) WARNING: only txt is fully implemented",
+ parser.add_argument('-x', action='store', help="format (txt, xml, tei, or txm) WARNING: only txt is fully implemented",
default="txt",
- choices=["txt", "xml", "tei"]
+ choices=["txt", "xml", "tei", 'txm']
)
parser.add_argument('--sampling', action='store_true', help="Sample the texts?", default=False)
parser.add_argument('--sample_units', action='store', help="Units of length for sampling "
diff --git a/superstyl/load.py b/superstyl/load.py
index 4528c6f6..fe52683e 100644
--- a/superstyl/load.py
+++ b/superstyl/load.py
@@ -14,10 +14,11 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp
Main function to load a corpus from a collection of file, and an optional list of features to extract.
:param data_paths: paths to the source files
:param feat_list: an optional list of features (as created by load_corpus), default None
- :param feats: the type of features, one of 'words', 'chars', 'affixes, and 'POS'. Affixes are inspired by
- Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and, if keep_pos, punctuation n-grams.
- POS are currently only implemented for Modern English
- TODO: add met_line, met_syll
+ :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
+ Affixes are inspired by Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and,
+ if keep_punct, punctuation n-grams. From TEI, pos, lemma, met_line or met_syll can
+ be extracted; met_line is the prosodic (stress) annotation of a full verse; met_syll is a char n-gram of prosodic
+ annotation
:param n: n grams lengths (default 1)
:param k: How many most frequent? The function takes the rank of k (if k is smaller than the total number of features),
gets its frequencies, and only include features of superior or equal total frequencies.
@@ -45,6 +46,12 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp
:return a pandas dataFrame of text metadata and feature frequencies; a global list of features with their frequencies
"""
+ if feats in ('lemma', 'pos', 'met_line', 'met_syll') and format != 'tei':
+ raise ValueError("lemma, pos, met_line or met_syll are only possible with adequate tei format (@lemma, @pos, @met)")
+
+ if feats in ('met_line', 'met_syll') and units != 'lines':
+ raise ValueError("met_line or met_syll are only possible with tei format that includes lines and @met")
+
embeddedFreqs = False
if embedding:
print(".......loading embedding.......")
@@ -81,7 +88,6 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp
my_feats = [m[0] for m in feat_list] # keeping only the features without the frequencies
myTexts = fex.get_counts(myTexts, feat_list=my_feats, feats=feats, n=n, freqsType=freqsType)
-
if embedding:
print(".......embedding counts.......")
myTexts, my_feats = embed.get_embedded_counts(myTexts, my_feats, model, topn=neighbouring_size)
diff --git a/superstyl/load_from_config.py b/superstyl/load_from_config.py
index 103dd853..4bc7cca4 100644
--- a/superstyl/load_from_config.py
+++ b/superstyl/load_from_config.py
@@ -1,12 +1,11 @@
import json
-import superstyl
import pandas as pd
import os
import glob
from superstyl.load import load_corpus
-def load_corpus_from_config(config_path):
+def load_corpus_from_config(config_path, is_test=False):
"""
Load a corpus based on a JSON configuration file.
@@ -55,7 +54,7 @@ def load_corpus_from_config(config_path):
# Get sampling parameters
sampling_params = config.get('sampling', {})
-
+
# Use the first feature to create the base corpus with sampling
feature_configs = config.get('features', [])
if not feature_configs:
@@ -87,9 +86,9 @@ def load_corpus_from_config(config_path):
'sampling': sampling_params.get('enabled', False),
'units': sampling_params.get('units', 'words'),
'size': sampling_params.get('sample_size', 3000),
- 'step': sampling_params.get('sample_step', None),
+ 'step': sampling_params.get('step', None),
'max_samples': sampling_params.get('max_samples', None),
- 'samples_random': sampling_params.get('sample_random', False),
+ 'samples_random': sampling_params.get('samples_random', False),
'keep_punct': feature_config.get('keep_punct', False),
'keep_sym': feature_config.get('keep_sym', False),
'no_ascii': feature_config.get('no_ascii', False),
@@ -115,6 +114,7 @@ def load_corpus_from_config(config_path):
# Check for feature list file
feat_list = None
feat_list_path = feature_config.get('feat_list')
if feat_list_path:
if feat_list_path.endswith('.json'):
with open(feat_list_path, 'r') as f:
@@ -133,9 +133,9 @@ def load_corpus_from_config(config_path):
'sampling': sampling_params.get('enabled', False),
'units': sampling_params.get('units', 'words'),
'size': sampling_params.get('sample_size', 3000),
- 'step': sampling_params.get('sample_step', None),
+ 'step': sampling_params.get('step', None),
'max_samples': sampling_params.get('max_samples', None),
- 'samples_random': sampling_params.get('sample_random', False),
+ 'samples_random': sampling_params.get('samples_random', False),
'keep_punct': config.get('keep_punct', False),
'keep_sym': config.get('keep_sym', False),
'no_ascii': config.get('no_ascii', False),
@@ -146,11 +146,17 @@ def load_corpus_from_config(config_path):
}
print(f"Loading {feature_name}...")
+
corpus, features = load_corpus(paths, feat_list=feat_list, **params)
# Store corpus and features
corpora[feature_name] = corpus
- feature_lists[feature_name] = features
+
+ if feat_list is not None and is_test:
+ feature_lists[feature_name] = feat_list
+ else:
+ feature_lists[feature_name] = features
+
# Create a merged dataset
print("Creating merged dataset...")
@@ -170,6 +176,8 @@ def load_corpus_from_config(config_path):
# Add features from each corpus
for name, corpus in corpora.items():
+ single_feature = []
+
feature_cols = [col for col in corpus.columns if col not in ['author', 'lang']]
# Rename columns to avoid duplicates
@@ -181,8 +189,9 @@ def load_corpus_from_config(config_path):
# Add features to the combined list with prefixes
for feature in feature_lists[name]:
- all_features.append((f"{name}_{feature[0]}", feature[1]))
+ single_feature.append((feature[0], feature[1]))
+ all_features.append(single_feature)
# Return the merged corpus and combined feature list
return merged, all_features
diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py
index fe6ba5c0..d04d8c12 100755
--- a/superstyl/preproc/features_extract.py
+++ b/superstyl/preproc/features_extract.py
@@ -10,7 +10,7 @@ def count_features(text, feats ="words", n = 1):
Get feature counts from a text (words, chars or POS n-grams, or affixes(+punct if keep_punct),
following Sapkota et al., NAACL 2015
:param text: the source text
- :param feats: the type of feats: words, chars, POS (supported only for English), or affixes
+ :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
:param n: the length of n-grams
:return: features absolute frequencies in text as a counter, and the total of frequencies
"""
@@ -20,9 +20,9 @@ def count_features(text, feats ="words", n = 1):
raise ValueError("Text cannot be empty.")
if n < 1 or not isinstance(n, int):
raise ValueError("n must be a positive integer.")
- if feats not in ["words", "chars", "affixes", "pos", "met_line", "met_syll"]:
- raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', or 'pos'.")
- if feats == "words":
+ if feats not in ["words", "chars", "affixes", "lemma", "pos", "met_line", "met_syll"]:
+ raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', 'lemma' or 'pos'.")
+ if feats in ("words", "lemma", "pos"):
tokens = nltk.tokenize.wordpunct_tokenize(text)
if n > 1:
tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]
@@ -46,20 +46,6 @@ def count_features(text, feats ="words", n = 1):
]
tokens = affs + space_affs_and_punct
- #POS in english with NLTK - need to propose spacy later on
- elif feats == "pos":
- try:
- nltk.data.find('taggers/averaged_perceptron_tagger_eng')
- except:
- nltk.download('averaged_perceptron_tagger_eng')
- words = nltk.tokenize.wordpunct_tokenize(text)
- pos_tags = [pos for word, pos in nltk.pos_tag(words)]
- if n > 1:
- tokens = ["_".join(t) for t in list(nltk.ngrams(pos_tags, n))]
- else:
- tokens = pos_tags
- total = len(tokens)
-
elif feats == "met_line":
tokens = text.split()
if n > 1:
@@ -73,7 +59,7 @@ def count_features(text, feats ="words", n = 1):
#Adding an error message in case some distracted guy like me would enter something wrong:
else:
- raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll' or 'pos'.")
+ raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', 'lemma' or 'pos'.")
counts = Counter()
counts.update(tokens)
@@ -108,7 +94,7 @@ def get_feature_list(myTexts, feats="words", n=1, freqsType="relative"):
"""
:param myTexts: a 'myTexts' object, containing documents to be processed
:param feat_list: a list of features to be selected
- :param feats: type of feats (words, chars, affixes, POS, met_line, or met_syll)
+ :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
:param freqsType: "relative", "absolute" or "binary" frequencies
:param n: n-grams length
:return: list of features, with total frequency
@@ -142,14 +128,12 @@ def get_doc_frequency(myTexts):
return feats_doc_freq
-
-
def get_counts(myTexts, feat_list=None, feats = "words", n = 1, freqsType = "relative"):
"""
Get counts for a collection of texts
:param myTexts: the document collection
:param feat_list: a list of features to be selected (None for all)
- :param feats: the type of feats (words, chars, affixes, POS)
+ :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
:param n: the length of n-grams
:param freqsType: relative, absolute or binarised freqs
:return: the collection with, for each text, a 'wordCounts' dictionary
diff --git a/superstyl/preproc/pipe.py b/superstyl/preproc/pipe.py
index c86949da..7c675b68 100755
--- a/superstyl/preproc/pipe.py
+++ b/superstyl/preproc/pipe.py
@@ -1,5 +1,3 @@
-import unicodedata
-
from lxml import etree
import regex as re
import unidecode
@@ -52,6 +50,153 @@ def XML_to_text(path):
return aut, re.sub(r"\s+", " ", str(myxsl(my_doc)))
+def txm_to_units(path, units="lines", feats="words"):
+ """
+ Extract units from TXM file
+ :param path: path to TXM file
+ :param units: units to extract ("lines"/"verses" or "words")
+ :param feats: features to extract ("words", "lemma", or "pos")
+ :return: list of extracted units
+ """
+ myxsl = etree.XML('''
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+''')
+ myxsl = etree.XSLT(myxsl)
+
+ with open(path, 'r') as f:
+ my_doc = etree.parse(f)
+
+ units_tokens = str(myxsl(my_doc, units=etree.XSLT.strparam(units), feats=etree.XSLT.strparam(feats))).splitlines()
+ return units_tokens
+
+def tei_to_units(path, feats="words", units="lines"):
+
+ if feats in ["met_syll", "met_line"]:
+ feats = "met"
+ myxsl = etree.XML('''
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ''')
+ myxsl = etree.XSLT(myxsl)
+
+ with open(path, 'r') as f:
+ my_doc = etree.parse(f)
+
+ units_tokens = str(myxsl(my_doc, units=etree.XSLT.strparam(units), feats=etree.XSLT.strparam(feats))).splitlines()
+ return units_tokens
+
+def specialXML_to_text(path, format="tei", feats="words"):
+    """
+    Get the author and the flattened text from an annotated XML file (TEI or TXM)
+    :param path: path to the file; the author is read from the file name (the part before the first "_")
+    :param format: format of the source file, either "tei" or "txm"
+    :param feats: the type of features to extract, passed on to tei_to_units or txm_to_units
+    :return: the author, and the word-level units joined into a single string with whitespace runs collapsed
+    :raises ValueError: if format is neither "tei" nor "txm"
+    """
+    aut = path.split('/')[-1].split("_")[0]
+
+    if format == "tei":
+        units_tokens = tei_to_units(path, feats=feats, units="words")
+    elif format == "txm":
+        units_tokens = txm_to_units(path, feats=feats, units="words")
+    else:
+        # Without this branch an unknown format reached the return below with
+        # units_tokens unbound and failed with an UnboundLocalError.
+        raise ValueError("Unsupported format for specialXML_to_text. Choose from 'tei' or 'txm'.")
+
+    return aut, re.sub(r"\s+", " ", ' '.join(units_tokens))
+
def TXT_to_text(path):
"""
Get main text from xml file
@@ -147,7 +292,7 @@ def load_texts(paths, identify_lang=False, feats="words", format="txt", keep_pun
Loads a collection of documents into a 'myTexts' object for further processing.
TODO: a proper class
:param paths: path to docs
- TODO: add feats!
+ :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
:param identify_lang: whether or not try to identify lang (default: False)
:param format: format of the source files (implemented values: txt [default], xml)
:param keep_punct: whether or not to keep punctuation and caps.
@@ -165,6 +310,9 @@ def load_texts(paths, identify_lang=False, feats="words", format="txt", keep_pun
if format=='xml':
aut, text = XML_to_text(path)
+ # elif, not if: as a separate if, the else below also ran for format == 'xml' and replaced the XML_to_text result with TXT_to_text
+ elif format in ('tei', 'txm'):
+ aut, text = specialXML_to_text(path, format=format, feats=feats)
+
else:
aut, text = TXT_to_text(path)
@@ -198,7 +346,7 @@ def get_samples(path, size, step=None, samples_random=False, max_samples=10,
:param max_samples: maximum number of samples per author/clas
:param units: the units to use, one of "words" or "verses"
:param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED)
- :param feats: the type of features, TODO: document
+ :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
"""
if samples_random and step is not None:
@@ -213,127 +361,26 @@ def get_samples(path, size, step=None, samples_random=False, max_samples=10,
if units == "words" and format == "txt":
my_doc = TXT_to_text(path)
text = normalise(my_doc[1], keep_punct=keep_punct, keep_sym=keep_sym, no_ascii=no_ascii)
- units = nltk.tokenize.wordpunct_tokenize(text)
+ units_tokens = nltk.tokenize.wordpunct_tokenize(text)
- #TODO: DOCUMENT this format as TXM, and keep it only for retrocompatibility
+ # Kept only for backward compatibility with Psyché
if units == "verses" and format == "txm":
- myxsl = etree.XML('''
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- ''')
- myxsl = etree.XSLT(myxsl)
-
- with open(path, 'r') as f:
- my_doc = etree.parse(f)
-
- units = str(myxsl(my_doc)).splitlines()
-
- # and now generating output
- samples = []
+ units_tokens = txm_to_units(path, units=units)
if format == "tei":
- if feats in ["met_syll", "met_line"]:
- feats = "met"
- myxsl = etree.XML('''
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-''')
- myxsl = etree.XSLT(myxsl)
-
- with open(path, 'r') as f:
- my_doc = etree.parse(f)
-
- units = str(myxsl(my_doc, units=etree.XSLT.strparam(units), feats=etree.XSLT.strparam(feats))).splitlines()
+ units_tokens = tei_to_units(path, units=units, feats=feats)
# and now generating output
samples = []
if samples_random:
for k in range(max_samples):
- samples.append({"start": str(k)+'s', "end": str(k)+'e', "text": list(random.choices(units, k=size))})
+ samples.append({"start": str(k)+'s', "end": str(k)+'e', "text": list(random.choices(units_tokens, k=size))})
else:
current = 0
- while current + size <= len(units):
- samples.append({"start": current, "end": current + size, "text": list(units[current:(current + size)])})
+ while current + size <= len(units_tokens):
+ samples.append({"start": current, "end": current + size, "text": list(units_tokens[current:(current + size)])})
current = current + step
return samples
@@ -353,7 +400,7 @@ def docs_to_samples(paths, size, step=None, units="words", samples_random=False,
:param keep_punct: whether to keep punctuation and caps.
:param max_samples: maximum number of samples per author/class.
:param identify_lang: whether to try to identify lang (default: False)
- :param feats: TODO
+ :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
:return: a myTexts object
"""
myTexts = []
diff --git a/superstyl/svm.py b/superstyl/svm.py
index 204987a5..abceb823 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -288,13 +288,13 @@ def plot_coefficients(coefs, feature_names, current_class, top_features=10):
-def plot_rolling(final_predictions, smoothing=3):
+def plot_rolling(final_predictions, smoothing=3, xlab = "Index (segment center)"):
"""
Plots the rolling stylometry results as lines of decision function values over the text.
Parameters:
- final_predictions_path : str
- Path to the CSV file containing final predictions generated by the SVM pipeline.
+ final_predictions : pandas.DataFrame
+ The final predictions output by train_svm.
smoothing : int or None
The window size for smoothing the curves.
@@ -304,24 +304,25 @@ def plot_rolling(final_predictions, smoothing=3):
"""
# Extract the segment center from the filename
+ my_final_predictions = final_predictions.copy() # to avoid modifying in place
segment_centers = []
- for fname in final_predictions['filename']:
+ for fname in my_final_predictions['filename']:
parts = fname.split('_')[-1].split('-')
start = int(parts[0])
end = int(parts[1])
center = (start + end) / 2.0
segment_centers.append(center)
- final_predictions['segment_center'] = segment_centers
+ my_final_predictions['segment_center'] = segment_centers
- final_predictions['filename'] = [fname.split('_')[1] for fname in final_predictions['filename']]
+ my_final_predictions['filename'] = [fname.split('_')[1] for fname in my_final_predictions['filename']]
# Identify candidate columns
known_cols = {'filename', 'author', 'segment_center'}
- candidate_cols = [c for c in final_predictions.columns if c not in known_cols]
+ candidate_cols = [c for c in my_final_predictions.columns if c not in known_cols]
- for work in final_predictions['filename'].unique():
- fpreds_work = final_predictions[final_predictions['filename'] == work]
+ for work in my_final_predictions['filename'].unique():
+ fpreds_work = my_final_predictions[my_final_predictions['filename'] == work]
# Sort by segment center to ensure chronological order
fpreds_work = fpreds_work.sort_values('segment_center')
@@ -336,7 +337,7 @@ def plot_rolling(final_predictions, smoothing=3):
plt.plot(fpreds_work['segment_center'], fpreds_work[col], label=col, linewidth=2)
plt.title('Rolling Stylometry Decision Functions Over ' + work)
- plt.xlabel('Word index (segment center)')
+ plt.xlabel(xlab)
plt.ylabel('Decision Function Value')
plt.ylim(min(-2, min(fpreds_work[candidate_cols].min()) - 0.2),
max(1, max(fpreds_work[candidate_cols].max())) + 0.2)
diff --git a/tests/test_error_handling.py b/tests/test_error_handling.py
new file mode 100644
index 00000000..cc37116a
--- /dev/null
+++ b/tests/test_error_handling.py
@@ -0,0 +1,320 @@
+import unittest
+import superstyl.load
+import superstyl.preproc.features_extract
+from superstyl.load_from_config import load_corpus_from_config
+import os
+import tempfile
+import json
+import glob
+
+THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+class ErrorHandlingTests(unittest.TestCase):
+ """Tests for error handling and ValueError raising"""
+
+ def setUp(self):
+ """Set up test files paths"""
+ self.test_paths = sorted(glob.glob(os.path.join(THIS_DIR, "testdata/*.txt")))
+ self.temp_dir = tempfile.TemporaryDirectory()
+
+ def tearDown(self):
+ """Clean up temporary directory"""
+ self.temp_dir.cleanup()
+
+ # =========================================================================
+ # Tests for load.py - ValueError for incompatible formats
+ # =========================================================================
+
+ def test_load_corpus_lemma_requires_tei(self):
+ # SCENARIO: lemma features require TEI format
+ # GIVEN: Attempting to use lemma with non-TEI format
+
+ # WHEN/THEN: Should raise ValueError
+ with self.assertRaises(ValueError) as context:
+ superstyl.load.load_corpus(
+ self.test_paths,
+ feats="lemma",
+ format="txt"
+ )
+
+ self.assertIn("lemma", str(context.exception))
+ self.assertIn("tei", str(context.exception).lower())
+
+ def test_load_corpus_pos_requires_tei(self):
+ # SCENARIO: pos features require TEI format
+ # GIVEN: Attempting to use pos with non-TEI format
+
+ # WHEN/THEN: Should raise ValueError
+ with self.assertRaises(ValueError) as context:
+ superstyl.load.load_corpus(
+ self.test_paths,
+ feats="pos",
+ format="txt"
+ )
+
+ self.assertIn("pos", str(context.exception))
+ self.assertIn("tei", str(context.exception).lower())
+
+ def test_load_corpus_met_line_requires_tei(self):
+ # SCENARIO: met_line features require TEI format
+ # GIVEN: Attempting to use met_line with non-TEI format
+
+ # WHEN/THEN: Should raise ValueError
+ with self.assertRaises(ValueError) as context:
+ superstyl.load.load_corpus(
+ self.test_paths,
+ feats="met_line",
+ format="txt"
+ )
+
+ self.assertIn("met_line", str(context.exception))
+ self.assertIn("tei", str(context.exception).lower())
+
+ def test_load_corpus_met_syll_requires_tei(self):
+ # SCENARIO: met_syll features require TEI format
+ # GIVEN: Attempting to use met_syll with non-TEI format
+
+ # WHEN/THEN: Should raise ValueError
+ with self.assertRaises(ValueError) as context:
+ superstyl.load.load_corpus(
+ self.test_paths,
+ feats="met_syll",
+ format="txt"
+ )
+
+ self.assertIn("met_syll", str(context.exception))
+ self.assertIn("tei", str(context.exception).lower())
+
+ def test_load_corpus_met_line_requires_lines_unit(self):
+ # SCENARIO: met_line requires units='lines'
+ # GIVEN: Attempting to use met_line with units='words'
+
+ # Create a dummy TEI file for this test
+ tei_path = os.path.join(self.temp_dir.name, "test_met.xml")
+ with open(tei_path, 'w') as f:
+ f.write('test')
+
+ # WHEN/THEN: Should raise ValueError
+ with self.assertRaises(ValueError) as context:
+ superstyl.load.load_corpus(
+ [tei_path],
+ feats="met_line",
+ format="tei",
+ units="words" # Wrong unit type
+ )
+
+ self.assertIn("met_line", str(context.exception))
+ self.assertIn("lines", str(context.exception))
+
+ def test_load_corpus_met_syll_requires_lines_unit(self):
+ # SCENARIO: met_syll requires units='lines'
+ # GIVEN: Attempting to use met_syll with units='words'
+
+ # Create a dummy TEI file for this test
+ tei_path = os.path.join(self.temp_dir.name, "test_met2.xml")
+ with open(tei_path, 'w') as f:
+ f.write('test')
+
+ # WHEN/THEN: Should raise ValueError
+ with self.assertRaises(ValueError) as context:
+ superstyl.load.load_corpus(
+ [tei_path],
+ feats="met_syll",
+ format="tei",
+ units="words" # Wrong unit type
+ )
+
+ self.assertIn("met_syll", str(context.exception))
+ self.assertIn("lines", str(context.exception))
+
+ # =========================================================================
+ # Tests for features_extract.py - ValueError for invalid parameters
+ # =========================================================================
+
+ def test_count_features_empty_text(self):
+ # SCENARIO: Empty text should raise ValueError
+ # GIVEN: An empty string as text
+
+ # WHEN/THEN: Should raise ValueError
+ with self.assertRaises(ValueError) as context:
+ superstyl.preproc.features_extract.count_features(
+ "", # Empty text
+ feats="words",
+ n=1
+ )
+
+ self.assertIn("empty", str(context.exception).lower())
+
+ def test_count_features_invalid_n_zero(self):
+ # SCENARIO: n must be positive
+ # GIVEN: n=0
+
+ # WHEN/THEN: Should raise ValueError
+ with self.assertRaises(ValueError) as context:
+ superstyl.preproc.features_extract.count_features(
+ "test text",
+ feats="words",
+ n=0 # Invalid n
+ )
+
+ self.assertIn("positive", str(context.exception).lower())
+
+ def test_count_features_invalid_n_negative(self):
+ # SCENARIO: n must be positive
+ # GIVEN: n=-1
+
+ # WHEN/THEN: Should raise ValueError
+ with self.assertRaises(ValueError) as context:
+ superstyl.preproc.features_extract.count_features(
+ "test text",
+ feats="words",
+ n=-1 # Invalid n
+ )
+
+ self.assertIn("positive", str(context.exception).lower())
+
+ def test_count_features_invalid_n_not_integer(self):
+ # SCENARIO: n must be an integer
+ # GIVEN: n=1.5 (float)
+
+ # WHEN/THEN: Should raise ValueError
+ with self.assertRaises(ValueError) as context:
+ superstyl.preproc.features_extract.count_features(
+ "test text",
+ feats="words",
+ n=1.5 # Not an integer
+ )
+
+ self.assertIn("integer", str(context.exception).lower())
+
+ def test_count_features_invalid_not_string(self):
+ # SCENARIO: text must be a string
+ # GIVEN: text is not a string (e.g., None)
+
+ # WHEN/THEN: Should raise ValueError
+ with self.assertRaises(ValueError) as context:
+ superstyl.preproc.features_extract.count_features(
+ None, # Not a string
+ feats="words",
+ n=1
+ )
+
+ self.assertIn("string", str(context.exception).lower())
+
+ def test_count_features_unsupported_feats_type(self):
+ # SCENARIO: feats must be a supported type
+ # GIVEN: An unsupported feats type
+
+ # WHEN/THEN: Should raise ValueError
+ with self.assertRaises(ValueError) as context:
+ superstyl.preproc.features_extract.count_features(
+ "test text",
+ feats="unsupported_type", # Invalid feats type
+ n=1
+ )
+
+ self.assertIn("Unsupported", str(context.exception))
+
+ def test_get_counts_invalid_frequency_type(self):
+ # SCENARIO: freqsType must be valid
+ # GIVEN: An unsupported frequency type
+
+ myTexts = [{"name": "test", "text": "test text"}]
+
+ # WHEN/THEN: Should raise ValueError
+ with self.assertRaises(ValueError) as context:
+ superstyl.preproc.features_extract.get_counts(
+ myTexts,
+ feats="words",
+ freqsType="invalid_type" # Invalid frequency type
+ )
+
+ self.assertIn("Unsupported frequency type", str(context.exception))
+
+ # =========================================================================
+ # Tests for load_from_config.py - Previously uncovered branches
+ # =========================================================================
+
+ def test_load_from_config_with_json_feature_list(self):
+ # SCENARIO: Load corpus with JSON feature list (line 119)
+ # GIVEN: A config with a JSON feature list
+
+ # Create a JSON feature list
+ feature_list = [["the", 0], ["is", 0]]
+ feature_list_path = os.path.join(self.temp_dir.name, "features.json")
+ with open(feature_list_path, 'w') as f:
+ json.dump(feature_list, f)
+
+ # Create config
+ config = {
+ "paths": self.test_paths,
+ "format": "txt",
+ "features": [
+ {
+ "name": "test_feature",
+ "type": "words",
+ "n": 1,
+ "feat_list": feature_list_path # JSON feature list
+ }
+ ]
+ }
+
+ config_path = os.path.join(self.temp_dir.name, "config.json")
+ with open(config_path, 'w') as f:
+ json.dump(config, f)
+
+ # WHEN: Loading corpus from config
+ corpus, features = load_corpus_from_config(config_path)
+
+ # THEN: Should load successfully with JSON feature list
+ self.assertIsNotNone(corpus)
+ self.assertIsNotNone(features)
+
+ def test_load_from_config_test_mode_uses_feat_list(self):
+ # SCENARIO: In test mode, use provided feat_list (line 156)
+ # GIVEN: A config with feat_list in test mode
+
+ # Create a JSON feature list
+ feature_list = [["the", 0], ["is", 0], ["text", 0]]
+ feature_list_path = os.path.join(self.temp_dir.name, "test_features.json")
+ with open(feature_list_path, 'w') as f:
+ json.dump(feature_list, f)
+
+ # Create config with multiple features, each with a feat_list (is_test itself is passed in the call below)
+ config = {
+ "paths": self.test_paths,
+ "format": "txt",
+ "features": [
+ {
+ "name": "feat1",
+ "type": "words",
+ "n": 1,
+ "feat_list": feature_list_path
+ },
+ {
+ "name": "feat2",
+ "type": "chars",
+ "n": 2,
+ "feat_list": feature_list_path
+ }
+ ]
+ }
+
+ config_path = os.path.join(self.temp_dir.name, "multi_config.json")
+ with open(config_path, 'w') as f:
+ json.dump(config, f)
+
+ # WHEN: Loading corpus from config
+ corpus, features = load_corpus_from_config(config_path, is_test=True)
+
+ # THEN: Should use the provided feature list
+ self.assertIsNotNone(corpus)
+ self.assertIsNotNone(features)
+ # features should be a list of feature lists
+ self.assertIsInstance(features, list)
+ self.assertEqual(len(features), 2) # Two feature sets
+
+
+if __name__ == '__main__':
+ unittest.main()
\ No newline at end of file
diff --git a/tests/test_load_corpus.py b/tests/test_load_corpus.py
index 5de65d26..076aa199 100644
--- a/tests/test_load_corpus.py
+++ b/tests/test_load_corpus.py
@@ -286,22 +286,7 @@ def test_load_corpus(self):
self.assertEqual(sorted(feats), sorted(expected_feats))
self.assertEqual(corpus.to_dict(), expected_corpus)
- # WHEN
- corpus, feats = superstyl.load.load_corpus(sorted(self.paths[1:]), feats="pos", n=1, format="txt", freqsType="absolute")
-
- # THEN
- expected_feats = [('DT', 4), ('NN', 2), ('VBZ', 2), ('RB', 1)]
- expected_corpus = {
- 'author': {'Smith_Letter1.txt': 'Smith', 'Smith_Letter2.txt': 'Smith'},
- 'lang': {'Smith_Letter1.txt': 'NA', 'Smith_Letter2.txt': 'NA'},
- 'DT': {'Smith_Letter1.txt': 2 , 'Smith_Letter2.txt': 2},
- 'NN': {'Smith_Letter1.txt': 1 , 'Smith_Letter2.txt': 1},
- 'VBZ': {'Smith_Letter1.txt': 1, 'Smith_Letter2.txt': 1},
- 'RB': {'Smith_Letter1.txt': 0, 'Smith_Letter2.txt': 1}
- }
-
- self.assertEqual(sorted(feats), sorted(expected_feats))
- self.assertEqual(corpus.to_dict(), expected_corpus)
+ # TODO: add tests for lemma, pos, met_line, met_syll, and loading from tei, and from txm
# Now, test embedding
# WHEN
diff --git a/tests/test_xml_loading.py b/tests/test_xml_loading.py
new file mode 100644
index 00000000..c874bbb8
--- /dev/null
+++ b/tests/test_xml_loading.py
@@ -0,0 +1,304 @@
+import unittest
+import superstyl.preproc.pipe
+import os
+import glob
+
+THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+class XMLLoadingTests(unittest.TestCase):
+ """Tests for XML, TEI, and TXM file loading functions"""
+
+    def setUp(self):
+        """Resolve the XML, TEI and TXM fixture paths in the testdata directory next to this file."""
+        fixtures = os.path.join(THIS_DIR, "testdata")
+        self.xml_path, self.tei_path, self.txm_path = (os.path.join(fixtures, name) for name in
+                                                       ("Smith_Song1.xml", "Dupont_TEIPoem1.xml", "Smith_TXM1.xml"))
+
+    def test_XML_to_text(self):
+        """Check that XML_to_text yields the author and the whitespace-normalised text of a simple XML file."""
+        # GIVEN: An XML file with author and text elements (self.xml_path)
+
+        # WHEN: Loading the XML file
+        aut, text = superstyl.preproc.pipe.XML_to_text(self.xml_path)
+
+        # THEN: Author and text are correctly extracted
+        self.assertEqual(aut, "Smith")
+        self.assertIn("test song", text)
+        self.assertIn("lyrics", text)
+        # Whitespace is normalised: single spaces are expected, runs of two or more are not
+        self.assertNotRegex(text, r" {2,}")
+
+    def test_tei_to_units_words(self):
+        """Check that feats="words", units="words" on the TEI fixture yields "This" and "is" as units."""
+        # GIVEN: the TEI fixture with annotated words
+        source = self.tei_path
+
+        # WHEN: asking for word forms, one unit per word
+        extracted = superstyl.preproc.pipe.tei_to_units(source, feats="words", units="words")
+
+        # THEN: a non-empty list comes back
+        self.assertIsInstance(extracted, list)
+        self.assertGreater(len(extracted), 0)
+
+        # Surrounding whitespace is ignored when looking for the expected tokens
+        stripped = [unit.strip() for unit in extracted]
+        for expected in ("This", "is"):
+            # Each expected word must appear as a unit of its own
+            self.assertIn(expected, stripped)
+
+    def test_tei_to_units_verses(self):
+        """Check that units="verses" on the TEI fixture produces exactly two units."""
+        # GIVEN: the TEI fixture, which holds verse lines
+        source = self.tei_path
+
+        # WHEN: word forms are requested, grouped by verse
+        verses = superstyl.preproc.pipe.tei_to_units(
+            source,
+            units="verses",
+            feats="words",
+        )
+
+        # THEN: the result is a list with one entry for each of the fixture's 2 lines
+        self.assertIsInstance(verses, list)
+        self.assertEqual(len(verses), 2)
+
+    def test_tei_to_units_lemma(self):
+        """Check that feats="lemma" on the TEI fixture yields the lemmas "this" and "be"."""
+        # GIVEN: the TEI fixture with lemma annotations
+        source = self.tei_path
+
+        # WHEN: lemmas are requested, one unit per word
+        lemmas = superstyl.preproc.pipe.tei_to_units(source, feats="lemma", units="words")
+
+        # THEN: a list is returned ...
+        self.assertIsInstance(lemmas, list)
+
+        # ... and it holds the expected lemmas once surrounding whitespace is stripped
+        stripped = [unit.strip() for unit in lemmas]
+        for expected in ("this", "be"):
+            self.assertIn(expected, stripped)
+
+    def test_tei_to_units_pos(self):
+        """Check that feats="pos" on the TEI fixture yields the tags "DET" and "VERB"."""
+        # GIVEN: the TEI fixture with POS annotations
+        source = self.tei_path
+
+        # WHEN: POS tags are requested, one unit per word
+        tags = superstyl.preproc.pipe.tei_to_units(source, feats="pos", units="words")
+
+        # THEN: a list is returned ...
+        self.assertIsInstance(tags, list)
+
+        # ... and both expected tags occur in it once surrounding whitespace is stripped
+        stripped = [tag.strip() for tag in tags]
+        for expected in ("DET", "VERB"):
+            self.assertIn(expected, stripped)
+
+    def test_tei_to_units_met_syll(self):
+        """Check that feats="met_syll", units="verses" on the TEI fixture yields a non-empty list."""
+        # GIVEN: the TEI fixture with metrical annotations
+        source = self.tei_path
+
+        # WHEN: metrical syllables are requested, grouped by verse
+        syllable_units = superstyl.preproc.pipe.tei_to_units(
+            source,
+            units="verses",
+            feats="met_syll",
+        )
+
+        # THEN: some units are returned; their content (the @met values) is not inspected here
+        self.assertIsInstance(syllable_units, list)
+        self.assertGreater(len(syllable_units), 0)
+
+    def test_tei_to_units_met_line(self):
+        """Check that feats="met_line" on the TEI fixture yields the two expected metrical patterns."""
+        # GIVEN: the TEI fixture with metrical annotations on its lines
+        source = self.tei_path
+
+        # WHEN: line-level metrical patterns are requested, grouped by verse
+        patterns = superstyl.preproc.pipe.tei_to_units(source, feats="met_line",
+                                                       units="verses")
+
+        # THEN: there is one entry per verse
+        self.assertIsInstance(patterns, list)
+        self.assertEqual(len(patterns), 2)
+
+        # Each entry contains the pattern expected for its line, in document order
+        first_line, second_line = patterns[0], patterns[1]
+        self.assertIn("01010101", first_line)
+        self.assertIn("10101010", second_line)
+
+    def test_txm_to_units_words(self):
+        """Check that units="words" on the TXM fixture yields a non-empty list containing "This" and "test"."""
+        # GIVEN: the TXM fixture with annotated words
+        source = self.txm_path
+
+        # WHEN: words are requested as units
+        words = superstyl.preproc.pipe.txm_to_units(source, units="words")
+
+        # THEN: a non-empty list is returned
+        self.assertIsInstance(words, list)
+        self.assertGreater(len(words), 0)
+
+        # Note: the NOMpro filter is reportedly not applied when individual words
+        # are extracted (units='words'), so every word of the fixture is expected
+        # in the output, NOMpro-tagged ones included.
+        # Only two ordinary words are actually checked below.
+        joined = ' '.join(words)
+        for expected in ("This", "test"):
+            self.assertIn(expected, joined)
+
+    def test_txm_to_units_verses(self):
+        """Check that units="verses" on the TXM fixture yields two verses without the NOMpro word "here"."""
+        # GIVEN: the TXM fixture with verse lines
+        source = self.txm_path
+
+        # WHEN: verses are requested as units
+        verses = superstyl.preproc.pipe.txm_to_units(source, units="verses")
+
+        # THEN: one entry per verse is returned
+        self.assertIsInstance(verses, list)
+        self.assertEqual(len(verses), 2)
+
+        joined = ' '.join(verses)
+        # "here" carries the NOMpro tag in the fixture, so it must be absent in verse mode ...
+        self.assertNotIn("here", joined)
+        # ... while an ordinary word such as "This" is kept
+        self.assertIn("This", joined)
+
+    def test_txm_to_units_lemma(self):
+        """Check that feats="lemma" on the TXM fixture yields the lemmas "be" and "this"."""
+        # GIVEN: the TXM fixture with lemma annotations
+        source = self.txm_path
+
+        # WHEN: lemmas are requested, one unit per word
+        lemmas = superstyl.preproc.pipe.txm_to_units(source, feats="lemma", units="words")
+
+        # THEN: a list is returned ...
+        self.assertIsInstance(lemmas, list)
+
+        # ... holding "be" (the lemma of "is") and "this" once whitespace is stripped
+        stripped = [unit.strip() for unit in lemmas]
+        for expected in ("be", "this"):
+            self.assertIn(expected, stripped)
+
+    def test_txm_to_units_pos(self):
+        """Check that feats="pos" on the TXM fixture yields the tags "DET" and "VERB"."""
+        # GIVEN: the TXM fixture with POS annotations
+        source = self.txm_path
+
+        # WHEN: POS tags are requested, one unit per word
+        tags = superstyl.preproc.pipe.txm_to_units(source, feats="pos", units="words")
+
+        # THEN: a list is returned ...
+        self.assertIsInstance(tags, list)
+
+        # ... and both expected tags occur in it once surrounding whitespace is stripped
+        stripped = [tag.strip() for tag in tags]
+        for expected in ("DET", "VERB"):
+            self.assertIn(expected, stripped)
+
+    def test_specialXML_to_text_tei(self):
+        """Check that specialXML_to_text with format="tei" returns author "Dupont" and non-empty, normalised text."""
+        # GIVEN: A TEI format file (self.tei_path, i.e. Dupont_TEIPoem1.xml)
+
+        # WHEN: Loading with format="tei"
+        aut, text = superstyl.preproc.pipe.specialXML_to_text(
+            self.tei_path,
+            format="tei",
+            feats="words"
+        )
+
+        # THEN: Author is extracted from filename and text is normalized
+        self.assertEqual(aut, "Dupont")
+        self.assertIsInstance(text, str)
+        self.assertGreater(len(text), 0)
+        # Whitespace is normalised to single spaces: no run of two or more may remain
+        self.assertNotRegex(text, r" {2,}")
+
+    def test_specialXML_to_text_txm(self):
+        """Check that specialXML_to_text with format="txm" returns author "Smith" and text mentioning "test"."""
+        # GIVEN: the TXM fixture
+        source = self.txm_path
+
+        # WHEN: it is loaded as TXM, asking for word forms
+        author, content = superstyl.preproc.pipe.specialXML_to_text(source, format="txm",
+                                                                    feats="words")
+
+        # THEN: the author matches the "Smith_" prefix of the fixture's filename
+        self.assertEqual(author, "Smith")
+
+        # The text is a non-empty string ...
+        self.assertIsInstance(content, str)
+        self.assertGreater(len(content), 0)
+        # ... that mentions "test" regardless of letter case
+        self.assertIn("test", content.lower())
+
+    def test_specialXML_to_text_with_lemma(self):
+        """Check that loading the TEI fixture with feats="lemma" gives text containing "be" and "this"."""
+        # GIVEN: the TEI fixture with lemma annotations
+        source = self.tei_path
+
+        # WHEN: it is loaded as TEI, asking for lemmas
+        author, content = superstyl.preproc.pipe.specialXML_to_text(source, format="tei",
+                                                                    feats="lemma")
+
+        # THEN: the author comes out as "Dupont"
+        self.assertEqual(author, "Dupont")
+
+        # "be" (the lemma of "is") and "this" both occur in the text
+        for expected in ("be", "this"):
+            self.assertIn(expected, content)
+
+    def test_specialXML_to_text_with_pos(self):
+        """Check that loading the TEI fixture with feats="pos" gives text containing "DET" and "VERB"."""
+        # GIVEN: the TEI fixture with POS annotations
+        source = self.tei_path
+
+        # WHEN: it is loaded as TEI, asking for POS tags
+        author, content = superstyl.preproc.pipe.specialXML_to_text(source, format="tei",
+                                                                    feats="pos")
+
+        # THEN: the author comes out as "Dupont"
+        self.assertEqual(author, "Dupont")
+
+        # Both expected tags occur somewhere in the returned text
+        for expected in ("DET", "VERB"):
+            self.assertIn(expected, content)
+
+    def test_specialXML_to_text_txm_with_lemma(self):
+        """Check that loading the TXM fixture with feats="lemma" gives text containing "be" and "this"."""
+        # GIVEN: the TXM fixture with lemma annotations
+        source = self.txm_path
+
+        # WHEN: it is loaded as TXM, asking for lemmas
+        author, content = superstyl.preproc.pipe.specialXML_to_text(source, format="txm",
+                                                                    feats="lemma")
+
+        # THEN: the author comes out as "Smith"
+        self.assertEqual(author, "Smith")
+
+        # "be" (the lemma of "is") and "this" both occur in the text
+        for expected in ("be", "this"):
+            self.assertIn(expected, content)
+
+    def test_specialXML_to_text_txm_with_pos(self):
+        """Check that loading the TXM fixture with feats="pos" gives text containing "DET" and "VERB"."""
+        # GIVEN: the TXM fixture with POS annotations
+        source = self.txm_path
+
+        # WHEN: it is loaded as TXM, asking for POS tags
+        author, content = superstyl.preproc.pipe.specialXML_to_text(source, format="txm",
+                                                                    feats="pos")
+
+        # THEN: the author comes out as "Smith"
+        self.assertEqual(author, "Smith")
+
+        # Both expected tags occur somewhere in the returned text
+        for expected in ("DET", "VERB"):
+            self.assertIn(expected, content)
+
+
+if __name__ == '__main__':
+ unittest.main()
\ No newline at end of file
diff --git a/tests/testdata/Dupont_TEIPoem1.xml b/tests/testdata/Dupont_TEIPoem1.xml
new file mode 100644
index 00000000..a737babd
--- /dev/null
+++ b/tests/testdata/Dupont_TEIPoem1.xml
@@ -0,0 +1,30 @@
+
+
+
+
+
+ Test Poem
+
+
+
+
+
+
+
+ This
+ is
+ the
+ first
+ line
+
+
+ And
+ this
+ is
+ the
+ second
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/testdata/Smith_Song1.xml b/tests/testdata/Smith_Song1.xml
new file mode 100644
index 00000000..d713cc96
--- /dev/null
+++ b/tests/testdata/Smith_Song1.xml
@@ -0,0 +1,5 @@
+
+
+ Smith
+ This is a test song with some lyrics
+
\ No newline at end of file
diff --git a/tests/testdata/Smith_TXM1.xml b/tests/testdata/Smith_TXM1.xml
new file mode 100644
index 00000000..0411bf57
--- /dev/null
+++ b/tests/testdata/Smith_TXM1.xml
@@ -0,0 +1,55 @@
+
+
+
+
+
+ Test TXM Text
+
+
+
+
+
+
+
+
+ This
+ this
+ DET
+
+
+ is
+ be
+ VERB
+
+
+ a
+ a
+ DET
+
+
+ test
+ test
+ NOUN
+
+
+
+
+ Second
+ second
+ ADJ
+
+
+ line
+ line
+ NOUN
+
+
+ here
+ here
+ NOMpro
+
+
+
+
+
+
\ No newline at end of file