SupervisedStylometry · TheoMoins · Nov 27, 2025 · May 23, 2025 · Jun 4, 2025 · Nov 20, 2025
diff --git a/.coverage b/.coverage
diff --git a/load_corpus.py b/load_corpus.py
@@ -18,18 +18,19 @@
     parser.add_argument('-f', action="store", help="optional list of features, either in json (generated by"
                                                    " Superstyl) or simple txt (one word per line)", default=False)
     parser.add_argument('-t', action='store', help="types of features (words, chars, affixes - "
-                                                   "as per Sapkota et al. 2015 - or pos). pos are currently"
-                                                   "only implemented for Modern English", type=str,
-                        default="words", choices=["words", "chars", "affixes", "pos"])
+                                                   "as per Sapkota et al. 2015 -, as well as lemma or pos, met_line, "
+                                                   "met_syll (those four last only for TEI files with proper annotation)"
+                                                   , type=str,
+                        default="words", choices=["words", "chars", "affixes", "pos", "lemma", "met_line", "met_syll"])
     parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int)
     parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
     parser.add_argument('--freqs', action='store', help="relative, absolute or binarised freqs",
                         default="relative",
                         choices=["relative", "absolute", "binary"]
                         )
-    parser.add_argument('-x', action='store', help="format (txt, xml or tei) WARNING: only txt is fully implemented",
+    parser.add_argument('-x', action='store', help="format (txt, xml, tei, or txm) WARNING: only txt is fully implemented",
                         default="txt",
-                        choices=["txt", "xml", "tei"]
+                        choices=["txt", "xml", "tei", 'txm']
                         )
     parser.add_argument('--sampling', action='store_true', help="Sample the texts?", default=False)
     parser.add_argument('--sample_units', action='store', help="Units of length for sampling "

diff --git a/superstyl/load.py b/superstyl/load.py
@@ -14,10 +14,11 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp
     Main function to load a corpus from a collection of file, and an optional list of features to extract.
     :param data_paths: paths to the source files
     :param feat_list: an optional list of features (as created by load_corpus), default None
-    :param feats: the type of features, one of 'words', 'chars', 'affixes, and 'POS'. Affixes are inspired by
-    Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and, if keep_pos, punctuation n-grams.
-    POS are currently only implemented for Modern English
-    TODO: add met_line, met_syll
+    :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
+    Affixes are inspired by Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and,
+    if keep_punct, punctuation n-grams. From TEI, pos, lemma, met_line or met_syll can
+    be extracted; met_line is the prosodic (stress) annotation of a full verse; met_syll is a char n-gram of prosodic
+    annotation
     :param n: n grams lengths (default 1)
     :param k: How many most frequent? The function takes the rank of k (if k is smaller than the total number of features),
     gets its frequencies, and only include features of superior or equal total frequencies.
@@ -45,6 +46,12 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp
     :return a pandas dataFrame of text metadata and feature frequencies; a global list of features with their frequencies
     """
 
+    if feats in ('lemma', 'pos', 'met_line', 'met_syll') and format != 'tei':
+        raise ValueError("lemma, pos, met_line or met_syll are only possible with adequate tei format (@lemma, @pos, @met)")
+
+    if feats in ('met_line', 'met_syll') and units != 'lines':
+        raise ValueError("met_line or met_syll are only possible with tei format that includes lines and @met")
+
     embeddedFreqs = False
     if embedding:
         print(".......loading embedding.......")
@@ -81,7 +88,6 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp
 
     my_feats = [m[0] for m in feat_list] # keeping only the features without the frequencies
     myTexts = fex.get_counts(myTexts, feat_list=my_feats, feats=feats, n=n, freqsType=freqsType)
-
     if embedding:
         print(".......embedding counts.......")
         myTexts, my_feats = embed.get_embedded_counts(myTexts, my_feats, model, topn=neighbouring_size)

diff --git a/superstyl/load_from_config.py b/superstyl/load_from_config.py
@@ -1,12 +1,11 @@
 import json
-import superstyl
 import pandas as pd
 import os
 import glob
 
 from superstyl.load import load_corpus
 
-def load_corpus_from_config(config_path):
+def load_corpus_from_config(config_path, is_test=False):
     """
     Load a corpus based on a JSON configuration file.
 
@@ -55,7 +54,7 @@ def load_corpus_from_config(config_path):
 
     # Get sampling parameters
     sampling_params = config.get('sampling', {})
-    
+
     # Use the first feature to create the base corpus with sampling
     feature_configs = config.get('features', [])
     if not feature_configs:
@@ -87,9 +86,9 @@ def load_corpus_from_config(config_path):
             'sampling': sampling_params.get('enabled', False),
             'units': sampling_params.get('units', 'words'),
             'size': sampling_params.get('sample_size', 3000),
-            'step': sampling_params.get('sample_step', None),
+            'step': sampling_params.get('step', None),
             'max_samples': sampling_params.get('max_samples', None),
-            'samples_random': sampling_params.get('sample_random', False),
+            'samples_random': sampling_params.get('samples_random', False),
             'keep_punct': feature_config.get('keep_punct', False),
             'keep_sym': feature_config.get('keep_sym', False),
             'no_ascii': feature_config.get('no_ascii', False),
@@ -115,6 +114,7 @@ def load_corpus_from_config(config_path):
         # Check for feature list file
         feat_list = None
         feat_list_path = feature_config.get('feat_list')
+        print(feat_list_path)
         if feat_list_path:
             if feat_list_path.endswith('.json'):
                 with open(feat_list_path, 'r') as f:
@@ -133,9 +133,9 @@ def load_corpus_from_config(config_path):
             'sampling': sampling_params.get('enabled', False),
             'units': sampling_params.get('units', 'words'),
             'size': sampling_params.get('sample_size', 3000),
-            'step': sampling_params.get('sample_step', None),
+            'step': sampling_params.get('step', None),
             'max_samples': sampling_params.get('max_samples', None),
-            'samples_random': sampling_params.get('sample_random', False),
+            'samples_random': sampling_params.get('samples_random', False),
             'keep_punct': config.get('keep_punct', False),
             'keep_sym': config.get('keep_sym', False),
             'no_ascii': config.get('no_ascii', False),
@@ -146,11 +146,17 @@ def load_corpus_from_config(config_path):
         }
 
         print(f"Loading {feature_name}...")
+
         corpus, features = load_corpus(paths, feat_list=feat_list, **params)
 
         # Store corpus and features
         corpora[feature_name] = corpus
-        feature_lists[feature_name] = features
+
+        if feat_list is not None and is_test:
+            feature_lists[feature_name] = feat_list
+        else:
+            feature_lists[feature_name] = features
+
 
     # Create a merged dataset
     print("Creating merged dataset...")
@@ -170,6 +176,8 @@ def load_corpus_from_config(config_path):
 
     # Add features from each corpus
     for name, corpus in corpora.items():
+        single_feature = []
+
         feature_cols = [col for col in corpus.columns if col not in ['author', 'lang']]
 
         # Rename columns to avoid duplicates
@@ -181,8 +189,9 @@ def load_corpus_from_config(config_path):
 
         # Add features to the combined list with prefixes
         for feature in feature_lists[name]:
-            all_features.append((f"{name}_{feature[0]}", feature[1]))
+            single_feature.append((feature[0], feature[1]))
 
+        all_features.append(single_feature)
     # Return the merged corpus and combined feature list
     return merged, all_features
 
diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py
@@ -10,7 +10,7 @@ def count_features(text, feats ="words", n = 1):
     Get feature counts from  a text (words, chars or POS n-grams, or affixes(+punct if keep_punct),
     following Sapkota et al., NAACL 2015
     :param text: the source text
-    :param feats: the type of feats: words, chars, POS (supported only for English), or affixes
+    :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
     :param n: the length of n-grams
     :return: features absolute frequencies in text as a counter, and the total of frequencies
     """
@@ -20,9 +20,9 @@ def count_features(text, feats ="words", n = 1):
         raise ValueError("Text cannot be empty.")
     if n < 1 or not isinstance(n, int):
         raise ValueError("n must be a positive integer.")
-    if feats not in ["words", "chars", "affixes", "pos", "met_line", "met_syll"]:
-        raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', or 'pos'.")
-    if feats == "words":
+    if feats not in ["words", "chars", "affixes", "lemma", "pos", "met_line", "met_syll"]:
+        raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', 'lemma' or 'pos'.")
+    if feats in ("words", "lemma", "pos"):
         tokens = nltk.tokenize.wordpunct_tokenize(text)
         if n > 1:
             tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]
@@ -46,20 +46,6 @@ def count_features(text, feats ="words", n = 1):
                                 ]
         tokens = affs + space_affs_and_punct
 
-    #POS in english with NLTK - need to propose spacy later on
-    elif feats == "pos":
-        try:
-            nltk.data.find('taggers/averaged_perceptron_tagger_eng')
-        except:
-            nltk.download('averaged_perceptron_tagger_eng')
-        words = nltk.tokenize.wordpunct_tokenize(text)
-        pos_tags = [pos for word, pos in nltk.pos_tag(words)]
-        if n > 1:
-            tokens = ["_".join(t) for t in list(nltk.ngrams(pos_tags, n))]
-        else:
-            tokens = pos_tags
-        total = len(tokens)
-
     elif feats == "met_line":
         tokens = text.split()
         if n > 1:
@@ -73,7 +59,7 @@ def count_features(text, feats ="words", n = 1):
 
     #Adding an error message in case some distracted guy like me would enter something wrong:
     else:
-        raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll' or 'pos'.")
+        raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', 'lemmas' or 'pos'.")
 
     counts = Counter()
     counts.update(tokens)
@@ -108,7 +94,7 @@ def get_feature_list(myTexts, feats="words", n=1, freqsType="relative"):
     """
     :param myTexts: a 'myTexts' object, containing documents to be processed
     :param feat_list: a list of features to be selected
-    :param feats: type of feats (words, chars, affixes, POS, met_line, or met_syll)
+    :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
     :param freqsType: "relative", "absolute" or "binary" frequencies
     :param n: n-grams length
     :return: list of features, with total frequency
@@ -142,14 +128,12 @@ def get_doc_frequency(myTexts):
     return feats_doc_freq
 
 
-
-
 def get_counts(myTexts, feat_list=None, feats = "words", n = 1, freqsType = "relative"):
     """
     Get counts for a collection of texts
     :param myTexts: the document collection
     :param feat_list: a list of features to be selected (None for all)
-    :param feats: the type of feats (words, chars, affixes, POS)
+    :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
     :param n: the length of n-grams
     :param freqsType: relative, absolute or binarised freqs
     :return: the collection with, for each text, a 'wordCounts' dictionary