From 9c23df9cd819cb881c6757fe5241735a8b17750f Mon Sep 17 00:00:00 2001
From: Theo <theo.moins@gmail.com>
Date: Fri, 23 May 2025 19:35:51 +0200
Subject: [PATCH 01/10] Adding a glob option to load_from_config

---
 superstyl/load_from_config.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/superstyl/load_from_config.py b/superstyl/load_from_config.py
index 4b66c445..103dd853 100644
--- a/superstyl/load_from_config.py
+++ b/superstyl/load_from_config.py
@@ -2,6 +2,7 @@
 import superstyl
 import pandas as pd
 import os
+import glob
 
 from superstyl.load import load_corpus
 
@@ -28,11 +29,25 @@ def load_corpus_from_config(config_path):
         config = json.load(f)
     
     # Get corpus paths
+
     if 'paths' in config:
         if isinstance(config['paths'], list):
-            paths = config['paths']
+            paths = []
+            for path in config['paths']:
+                if '*' in path or '?' in path or '[' in path:
+                    expanded_paths = glob.glob(path)
+                    if not expanded_paths:
+                        print(f"Warning: No files found for pattern '{path}'")
+                    paths.extend(expanded_paths)
+                else:
+                    paths.append(path)
         elif isinstance(config['paths'], str):
-            paths = [config['paths']]
+            if '*' in config['paths'] or '?' in config['paths'] or '[' in config['paths']:
+                paths = glob.glob(config['paths'])
+                if not paths:
+                    raise ValueError(f"No files found for glob pattern '{config['paths']}'")
+            else:
+                paths = [config['paths']]
         else:
             raise ValueError("Paths in config must be either a list or a glob pattern string")
     else:
@@ -83,7 +98,7 @@ def load_corpus_from_config(config_path):
             'neighbouring_size': feature_config.get('neighbouring_size', 10),
             'culling': feature_config.get('culling', 0)
         }
-        
+
         print(f"Loading corpus with {feature_name}...")
         corpus, features = load_corpus(paths, feat_list=feat_list, **params)
         

From 57caa7236958a82aad4ac2665f5ba746296d4534 Mon Sep 17 00:00:00 2001
From: Theo <theo.moins@gmail.com>
Date: Wed, 4 Jun 2025 19:25:27 +0200
Subject: [PATCH 02/10] Add file to debug

---
 superstyl/load.py             |  1 -
 superstyl/load_from_config.py | 26 ++++++++++++++++++--------
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/superstyl/load.py b/superstyl/load.py
index 4528c6f6..edc8232e 100644
--- a/superstyl/load.py
+++ b/superstyl/load.py
@@ -81,7 +81,6 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp
 
     my_feats = [m[0] for m in feat_list] # keeping only the features without the frequencies
     myTexts = fex.get_counts(myTexts, feat_list=my_feats, feats=feats, n=n, freqsType=freqsType)
-
     if embedding:
         print(".......embedding counts.......")
         myTexts, my_feats = embed.get_embedded_counts(myTexts, my_feats, model, topn=neighbouring_size)
diff --git a/superstyl/load_from_config.py b/superstyl/load_from_config.py
index 103dd853..9bf433fb 100644
--- a/superstyl/load_from_config.py
+++ b/superstyl/load_from_config.py
@@ -55,7 +55,7 @@ def load_corpus_from_config(config_path):
     
     # Get sampling parameters
     sampling_params = config.get('sampling', {})
-    
+
     # Use the first feature to create the base corpus with sampling
     feature_configs = config.get('features', [])
     if not feature_configs:
@@ -89,7 +89,7 @@ def load_corpus_from_config(config_path):
             'size': sampling_params.get('sample_size', 3000),
             'step': sampling_params.get('sample_step', None),
             'max_samples': sampling_params.get('max_samples', None),
-            'samples_random': sampling_params.get('sample_random', False),
+            'samples_random': sampling_params.get('samples_random', False),
             'keep_punct': feature_config.get('keep_punct', False),
             'keep_sym': feature_config.get('keep_sym', False),
             'no_ascii': feature_config.get('no_ascii', False),
@@ -115,6 +115,7 @@ def load_corpus_from_config(config_path):
         # Check for feature list file
         feat_list = None
         feat_list_path = feature_config.get('feat_list')
+        print(feat_list_path)
         if feat_list_path:
             if feat_list_path.endswith('.json'):
                 with open(feat_list_path, 'r') as f:
@@ -135,7 +136,7 @@ def load_corpus_from_config(config_path):
             'size': sampling_params.get('sample_size', 3000),
             'step': sampling_params.get('sample_step', None),
             'max_samples': sampling_params.get('max_samples', None),
-            'samples_random': sampling_params.get('sample_random', False),
+            'samples_random': sampling_params.get('samples_random', False),
             'keep_punct': config.get('keep_punct', False),
             'keep_sym': config.get('keep_sym', False),
             'no_ascii': config.get('no_ascii', False),
@@ -146,11 +147,17 @@ def load_corpus_from_config(config_path):
         }
         
         print(f"Loading {feature_name}...")
+
         corpus, features = load_corpus(paths, feat_list=feat_list, **params)
         
         # Store corpus and features
         corpora[feature_name] = corpus
-        feature_lists[feature_name] = features
+
+        if feat_list is not None:
+            feature_lists[feature_name] = feat_list
+        else:
+            feature_lists[feature_name] = features
+        print(len(feature_lists[feature_name]))
     
     # Create a merged dataset
     print("Creating merged dataset...")
@@ -170,19 +177,22 @@ def load_corpus_from_config(config_path):
     
     # Add features from each corpus
     for name, corpus in corpora.items():
+        single_feature = []
+
         feature_cols = [col for col in corpus.columns if col not in ['author', 'lang']]
         
         # Rename columns to avoid duplicates
-        renamed_cols = {col: f"{name}_{col}" for col in feature_cols}
-        feature_df = corpus[feature_cols].rename(columns=renamed_cols)
+        #renamed_cols = {col: col for col in feature_cols}
+        feature_df = corpus[feature_cols]#.rename(columns=renamed_cols)
         
         # Merge with the main DataFrame
         merged = pd.concat([merged, feature_df], axis=1)
         
         # Add features to the combined list with prefixes
-        for feature in feature_lists[name]:
-            all_features.append((f"{name}_{feature[0]}", feature[1]))
+        for feature in corpus.columns:#feature_lists[name]:
+            single_feature.append((feature, 0))#[0], feature[1]))
     
+        all_features.append(single_feature)
     # Return the merged corpus and combined feature list
     return merged, all_features
 

From 984770952b5795bd944cb07bfc11285b73952d37 Mon Sep 17 00:00:00 2001
From: Theo <theo.moins@gmail.com>
Date: Thu, 20 Nov 2025 14:01:36 +0100
Subject: [PATCH 03/10] Change load from config

---
 superstyl/load_from_config.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/superstyl/load_from_config.py b/superstyl/load_from_config.py
index 9bf433fb..5dc7d3a6 100644
--- a/superstyl/load_from_config.py
+++ b/superstyl/load_from_config.py
@@ -6,7 +6,7 @@
 
 from superstyl.load import load_corpus
 
-def load_corpus_from_config(config_path):
+def load_corpus_from_config(config_path, is_test=False):
     """
     Load a corpus based on a JSON configuration file.
     
@@ -87,7 +87,7 @@ def load_corpus_from_config(config_path):
             'sampling': sampling_params.get('enabled', False),
             'units': sampling_params.get('units', 'words'),
             'size': sampling_params.get('sample_size', 3000),
-            'step': sampling_params.get('sample_step', None),
+            'step': sampling_params.get('step', None),
             'max_samples': sampling_params.get('max_samples', None),
             'samples_random': sampling_params.get('samples_random', False),
             'keep_punct': feature_config.get('keep_punct', False),
@@ -134,7 +134,7 @@ def load_corpus_from_config(config_path):
             'sampling': sampling_params.get('enabled', False),
             'units': sampling_params.get('units', 'words'),
             'size': sampling_params.get('sample_size', 3000),
-            'step': sampling_params.get('sample_step', None),
+            'step': sampling_params.get('step', None),
             'max_samples': sampling_params.get('max_samples', None),
             'samples_random': sampling_params.get('samples_random', False),
             'keep_punct': config.get('keep_punct', False),
@@ -153,11 +153,11 @@ def load_corpus_from_config(config_path):
         # Store corpus and features
         corpora[feature_name] = corpus
 
-        if feat_list is not None:
+        if feat_list is not None and is_test:
             feature_lists[feature_name] = feat_list
         else:
             feature_lists[feature_name] = features
-        print(len(feature_lists[feature_name]))
+        
     
     # Create a merged dataset
     print("Creating merged dataset...")
@@ -182,15 +182,15 @@ def load_corpus_from_config(config_path):
         feature_cols = [col for col in corpus.columns if col not in ['author', 'lang']]
         
         # Rename columns to avoid duplicates
-        #renamed_cols = {col: col for col in feature_cols}
-        feature_df = corpus[feature_cols]#.rename(columns=renamed_cols)
+        renamed_cols = {col: col for col in feature_cols}
+        feature_df = corpus[feature_cols].rename(columns=renamed_cols)
         
         # Merge with the main DataFrame
         merged = pd.concat([merged, feature_df], axis=1)
         
         # Add features to the combined list with prefixes
-        for feature in corpus.columns:#feature_lists[name]:
-            single_feature.append((feature, 0))#[0], feature[1]))
+        for feature in feature_lists[name]:
+            single_feature.append((feature[0], feature[1]))
     
         all_features.append(single_feature)
     # Return the merged corpus and combined feature list

From 6fc2f21fbaf8b88d8bf1911cd1aad8103620fe48 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Thu, 20 Nov 2025 14:04:16 +0100
Subject: [PATCH 04/10] correct Florians bug

---
 superstyl/svm.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 204987a5..abceb823 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -288,13 +288,13 @@ def plot_coefficients(coefs, feature_names, current_class, top_features=10):
 
 
 
-def plot_rolling(final_predictions, smoothing=3):
+def plot_rolling(final_predictions, smoothing=3, xlab = "Index (segment center)"):
     """
     Plots the rolling stylometry results as lines of decision function values over the text.
     
     Parameters:
-    final_predictions_path : str
-        Path to the CSV file containing final predictions generated by the SVM pipeline.
+    final_predictions : pandas.DataFrame
+        The final predictions produced by train_svm.
     
     smoothing : int or None
         The window size for smoothing the curves.
@@ -304,24 +304,25 @@ def plot_rolling(final_predictions, smoothing=3):
     """
 
     # Extract the segment center from the filename
+    my_final_predictions = final_predictions.copy() # to avoid modifying in place
     segment_centers = []
-    for fname in final_predictions['filename']:
+    for fname in my_final_predictions['filename']:
         parts = fname.split('_')[-1].split('-')
         start = int(parts[0])
         end = int(parts[1])
         center = (start + end) / 2.0
         segment_centers.append(center)
 
-    final_predictions['segment_center'] = segment_centers
+    my_final_predictions['segment_center'] = segment_centers
 
-    final_predictions['filename'] = [fname.split('_')[1] for fname in final_predictions['filename']]
+    my_final_predictions['filename'] = [fname.split('_')[1] for fname in my_final_predictions['filename']]
     
     # Identify candidate columns
     known_cols = {'filename', 'author', 'segment_center'}
-    candidate_cols = [c for c in final_predictions.columns if c not in known_cols]
+    candidate_cols = [c for c in my_final_predictions.columns if c not in known_cols]
 
-    for work in final_predictions['filename'].unique():
-        fpreds_work = final_predictions[final_predictions['filename'] == work]
+    for work in my_final_predictions['filename'].unique():
+        fpreds_work = my_final_predictions[my_final_predictions['filename'] == work]
         # Sort by segment center to ensure chronological order
         fpreds_work = fpreds_work.sort_values('segment_center')
 
@@ -336,7 +337,7 @@ def plot_rolling(final_predictions, smoothing=3):
             plt.plot(fpreds_work['segment_center'], fpreds_work[col], label=col, linewidth=2)
 
         plt.title('Rolling Stylometry Decision Functions Over ' + work)
-        plt.xlabel('Word index (segment center)')
+        plt.xlabel(xlab)
         plt.ylabel('Decision Function Value')
         plt.ylim(min(-2, min(fpreds_work[candidate_cols].min()) - 0.2),
                  max(1, max(fpreds_work[candidate_cols].max())) + 0.2)

From 71461061b8cec5ed379a4b46e3b34d8288631540 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Thu, 20 Nov 2025 14:46:44 +0100
Subject: [PATCH 05/10] fixing this lemma issue (bye bye pos for english)

---
 superstyl/load.py                     |  15 +-
 superstyl/preproc/features_extract.py |  28 +--
 superstyl/preproc/pipe.py             | 251 ++++++++++++++------------
 3 files changed, 157 insertions(+), 137 deletions(-)

diff --git a/superstyl/load.py b/superstyl/load.py
index edc8232e..7c50a790 100644
--- a/superstyl/load.py
+++ b/superstyl/load.py
@@ -14,10 +14,11 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp
     Main function to load a corpus from a collection of file, and an optional list of features to extract.
     :param data_paths: paths to the source files
     :param feat_list: an optional list of features (as created by load_corpus), default None
-    :param feats: the type of features, one of 'words', 'chars', 'affixes, and 'POS'. Affixes are inspired by
-    Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and, if keep_pos, punctuation n-grams.
-    POS are currently only implemented for Modern English
-    TODO: add met_line, met_syll
+    :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
+    Affixes are inspired by Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and,
+    if keep_punct, punctuation n-grams. From TEI, pos, lemma, met_line or met_syll can
+    be extracted; met_line is the prosodic (stress) annotation of a full verse; met_syll is a char n-gram of prosodic
+    annotation
     :param n: n grams lengths (default 1)
     :param k: How many most frequent? The function takes the rank of k (if k is smaller than the total number of features),
     gets its frequencies, and only include features of superior or equal total frequencies.
@@ -45,6 +46,12 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp
     :return a pandas dataFrame of text metadata and feature frequencies; a global list of features with their frequencies
     """
 
+    if feats in ('lemma', 'pos', 'met_line', 'met_syll') and format is not 'tei':
+        raise ValueError("lemma, pos, met_line or met_syll are only possible with adequate tei format (@lemma, @pos, @met)")
+
+    if feats in ('met_line', 'met_syll') and units is not 'lines':
+        raise ValueError("met_line or met_syll are only possible with tei format that includes lines and @met")
+
     embeddedFreqs = False
     if embedding:
         print(".......loading embedding.......")
diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py
index fe6ba5c0..3ef055d3 100755
--- a/superstyl/preproc/features_extract.py
+++ b/superstyl/preproc/features_extract.py
@@ -20,9 +20,9 @@ def count_features(text, feats ="words", n = 1):
         raise ValueError("Text cannot be empty.")
     if n < 1 or not isinstance(n, int):
         raise ValueError("n must be a positive integer.")
-    if feats not in ["words", "chars", "affixes", "pos", "met_line", "met_syll"]:
-        raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', or 'pos'.")
-    if feats == "words":
+    if feats not in ["words", "chars", "affixes", "lemmas", "pos", "met_line", "met_syll"]:
+        raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', 'lemmas' or 'pos'.")
+    if feats in ("words", "lemmas", "pos"):
         tokens = nltk.tokenize.wordpunct_tokenize(text)
         if n > 1:
             tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]
@@ -46,20 +46,6 @@ def count_features(text, feats ="words", n = 1):
                                 ]
         tokens = affs + space_affs_and_punct
 
-    #POS in english with NLTK - need to propose spacy later on
-    elif feats == "pos":
-        try:
-            nltk.data.find('taggers/averaged_perceptron_tagger_eng')
-        except:
-            nltk.download('averaged_perceptron_tagger_eng')
-        words = nltk.tokenize.wordpunct_tokenize(text)
-        pos_tags = [pos for word, pos in nltk.pos_tag(words)]
-        if n > 1:
-            tokens = ["_".join(t) for t in list(nltk.ngrams(pos_tags, n))]
-        else:
-            tokens = pos_tags
-        total = len(tokens)
-
     elif feats == "met_line":
         tokens = text.split()
         if n > 1:
@@ -73,7 +59,7 @@ def count_features(text, feats ="words", n = 1):
 
     #Adding an error message in case some distracted guy like me would enter something wrong:
     else:
-        raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll' or 'pos'.")
+        raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', 'lemmas' or 'pos'.")
 
     counts = Counter()
     counts.update(tokens)
@@ -108,7 +94,7 @@ def get_feature_list(myTexts, feats="words", n=1, freqsType="relative"):
     """
     :param myTexts: a 'myTexts' object, containing documents to be processed
     :param feat_list: a list of features to be selected
-    :param feats: type of feats (words, chars, affixes, POS, met_line, or met_syll)
+    :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
     :param freqsType: "relative", "absolute" or "binary" frequencies
     :param n: n-grams length
     :return: list of features, with total frequency
@@ -142,14 +128,12 @@ def get_doc_frequency(myTexts):
     return feats_doc_freq
 
 
-
-
 def get_counts(myTexts, feat_list=None, feats = "words", n = 1, freqsType = "relative"):
     """
     Get counts for a collection of texts
     :param myTexts: the document collection
     :param feat_list: a list of features to be selected (None for all)
-    :param feats: the type of feats (words, chars, affixes, POS)
+    :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
     :param n: the length of n-grams
     :param freqsType: relative, absolute or binarised freqs
     :return: the collection with, for each text, a 'wordCounts' dictionary
diff --git a/superstyl/preproc/pipe.py b/superstyl/preproc/pipe.py
index c86949da..81819a00 100755
--- a/superstyl/preproc/pipe.py
+++ b/superstyl/preproc/pipe.py
@@ -52,6 +52,133 @@ def XML_to_text(path):
         return aut, re.sub(r"\s+", " ", str(myxsl(my_doc)))
 
 
+def txm_to_units(path, units="lines"):
+    """Extract the word forms (txm:form of each tei:w) of a TXM file, as a list of strings.
+    With units 'verses' (or its alias 'lines'), one string per tei:l, skipping the words whose txm:ana of type
+    '#frpos' is 'NOMpro' (presumably proper nouns); with units 'words', all forms are emitted without line breaks."""
+    # TODO: it would be fairly easy to implement lemma and pos feats, like for tei. If it is ever useful
+    myxsl = etree.XML('''<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:txm="http://textometrie.org/1.0" 
+    version="1.0">
+
+    <xsl:output method="text"/>
+    <xsl:param name="units"></xsl:param>
+
+    <xsl:template match="/">
+        <xsl:choose>
+            <xsl:when test="$units = 'verses' or $units = 'lines'">
+                <xsl:apply-templates select="descendant::tei:l"/>
+            </xsl:when>
+            <xsl:when test="$units = 'words'">
+                <xsl:apply-templates select="descendant::tei:w"/>
+            </xsl:when>
+        </xsl:choose>
+    </xsl:template>
+
+    <xsl:template match="tei:l">
+        <xsl:apply-templates select="descendant::tei:w[
+            not(txm:ana[@type='#frpos'] = 'NOMpro')
+            ]"/>
+        <xsl:text>&#xA;</xsl:text>
+    </xsl:template>
+
+    <xsl:template match="tei:w">
+        <xsl:text> </xsl:text>
+        <xsl:apply-templates select="txm:form"/>
+    </xsl:template>
+
+</xsl:stylesheet>''')
+    myxsl = etree.XSLT(myxsl)
+    with open(path, 'r') as f:
+        my_doc = etree.parse(f)
+    # units is handed over as a string parameter, so that the stylesheet can pick the matching branch
+    return str(myxsl(my_doc, units=etree.XSLT.strparam(units))).splitlines()
+
+def tei_to_units(path, feats="words", units="lines"):
+    """Extract units from a TEI file as a list of strings: one per tei:l for units 'verses' (or its alias 'lines'),
+    one per tei:w for units 'words'. Each word is rendered as its @lemma, @pos or @met for feats 'lemma', 'pos' or
+    'met_line'/'met_syll', and as its text content otherwise; for met feats a verse is rendered as its own @met."""
+    if feats in ["met_syll", "met_line"]:
+        feats = "met"
+    myxsl = etree.XML('''<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+        xmlns:tei="http://www.tei-c.org/ns/1.0"  
+        version="1.0">
+
+        <xsl:output method="text"/>
+
+        <xsl:param name="units"></xsl:param>
+        <xsl:param name="feats"></xsl:param>
+        <xsl:param name="keep_punct"></xsl:param>
+
+        <xsl:template match="/">
+            <xsl:choose>
+                <xsl:when test="$units = 'verses' or $units = 'lines'">
+                    <xsl:apply-templates select="descendant::tei:l"/>
+                </xsl:when>
+                <xsl:when test="$units = 'words'">
+                    <xsl:apply-templates select="descendant::tei:w"/>
+                </xsl:when>
+            </xsl:choose>
+        </xsl:template>
+
+        <xsl:template match="tei:l">
+            <xsl:choose>
+                <xsl:when test="$feats = 'met'">
+                        <xsl:choose>
+                            <xsl:when test="$keep_punct = 'true'">
+                                <xsl:value-of select="@met"/>
+                            </xsl:when>
+                            <xsl:otherwise>
+                                <xsl:value-of select="translate(@met, '.', '')"/>
+                            </xsl:otherwise>
+                        </xsl:choose>
+                </xsl:when>
+                <xsl:otherwise>
+                    <xsl:apply-templates select="descendant::tei:w"/>
+                </xsl:otherwise>
+            </xsl:choose>
+            <xsl:text>&#xA;</xsl:text>
+        </xsl:template>
+
+        <xsl:template match="tei:w">
+            <xsl:text> </xsl:text>
+            <xsl:choose>
+                <xsl:when test="$feats = 'met'">
+                    <xsl:value-of select="@met"/>
+                </xsl:when>
+                <xsl:when test="$feats = 'lemma'">
+                    <xsl:value-of select="@lemma"/>
+                </xsl:when>
+                <xsl:when test="$feats = 'pos'">
+                    <xsl:value-of select="@pos"/>
+                </xsl:when>
+                <xsl:otherwise>
+                    <xsl:apply-templates/>
+                </xsl:otherwise>
+            </xsl:choose>
+            <xsl:if test="$units = 'words'">
+                <!-- Then one word per line -->
+                <xsl:text>&#xA;</xsl:text>
+            </xsl:if>
+        </xsl:template>
+
+    </xsl:stylesheet>''')
+    myxsl = etree.XSLT(myxsl)
+    with open(path, 'r') as f:
+        my_doc = etree.parse(f)
+    # keep_punct is not passed to the stylesheet, so the dots of a verse's @met are always stripped
+    return str(myxsl(my_doc, units=etree.XSLT.strparam(units), feats=etree.XSLT.strparam(feats))).splitlines()
+
+def specialXML_to_text(path, format="tei", feats="words"):
+    """Return (author, text) for a TEI or TXM file: author is the file name up to its first '_'; feats only applies to TEI."""
+    if format == "tei":
+        units_tokens = tei_to_units(path, feats=feats, units="words")
+    elif format == "txm":
+        units_tokens = txm_to_units(path, units="words")  # txm_to_units takes no feats argument
+    else:
+        raise ValueError("format must be one of 'tei' or 'txm'")
+    return path.split('/')[-1].split("_")[0], re.sub(r"\s+", " ", ' '.join(units_tokens))
+
 def TXT_to_text(path):
     """
     Get main text from xml file
@@ -147,7 +274,7 @@ def load_texts(paths, identify_lang=False, feats="words", format="txt", keep_pun
     Loads a collection of documents into a 'myTexts' object for further processing.
     TODO: a proper class
     :param paths: path to docs
-    TODO: add feats!
+    :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
     :param identify_lang: whether or not try to identify lang (default: False)
     :param format: format of the source files (implemented values: txt [default], xml)
     :param keep_punct: whether or not to keep punctuation and caps.
@@ -165,6 +292,9 @@ def load_texts(paths, identify_lang=False, feats="words", format="txt", keep_pun
         if format=='xml':
             aut, text = XML_to_text(path)
 
+        elif format in ('tei', 'txm'):  # elif, so that 'xml' texts are not reloaded as plain text by the else below
+            aut, text = specialXML_to_text(path, format=format, feats=feats)
+
         else:
             aut, text = TXT_to_text(path)
 
@@ -198,7 +328,7 @@ def get_samples(path, size, step=None, samples_random=False, max_samples=10,
     :param max_samples: maximum number of samples per author/clas
     :param units: the units to use, one of "words" or "verses"
     :param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED)
-    :param feats: the type of features,  TODO: document
+    :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
     """
 
     if samples_random and step is not None:
@@ -213,127 +343,26 @@ def get_samples(path, size, step=None, samples_random=False, max_samples=10,
     if units == "words" and format == "txt":
         my_doc = TXT_to_text(path)
         text = normalise(my_doc[1], keep_punct=keep_punct, keep_sym=keep_sym, no_ascii=no_ascii)
-        units = nltk.tokenize.wordpunct_tokenize(text)
+        units_tokens = nltk.tokenize.wordpunct_tokenize(text)
 
-    #TODO: DOCUMENT this format as TXM, and keep it only for retrocompatibility
+    #Kept only for retrocompatibility with Psysché
     if units == "verses" and format == "txm":
-        myxsl = etree.XML('''<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-        xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:txm="http://textometrie.org/1.0" 
-        version="1.0">
-
-        <xsl:output method="text"/>
-
-        <xsl:template match="/">
-            <xsl:apply-templates select="descendant::tei:l"/>
-        </xsl:template>
-
-        <xsl:template match="tei:l">
-            <xsl:apply-templates select="descendant::tei:w[
-                not(txm:ana[@type='#frpos'] = 'NOMpro')
-                ]"/>
-            <xsl:text>&#xA;</xsl:text>
-        </xsl:template>
-
-        <xsl:template match="tei:w">
-            <xsl:text> </xsl:text>
-            <xsl:apply-templates select="txm:form"/>
-        </xsl:template>
-
-    </xsl:stylesheet>''')
-        myxsl = etree.XSLT(myxsl)
-
-        with open(path, 'r') as f:
-            my_doc = etree.parse(f)
-
-        units = str(myxsl(my_doc)).splitlines()
-
-    # and now generating output
-    samples = []
+        units_tokens = txm_to_units(path, units=units)
 
     if format == "tei":
-        if feats in ["met_syll", "met_line"]:
-            feats = "met"
-        myxsl = etree.XML('''<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
-    xmlns:tei="http://www.tei-c.org/ns/1.0"  
-    version="1.0">
-    
-    <xsl:output method="text"/>
-    
-    <xsl:param name="units"></xsl:param>
-    <xsl:param name="feats"></xsl:param>
-    <xsl:param name="keep_punct"></xsl:param>
-    
-    <xsl:template match="/">
-        <xsl:choose>
-            <xsl:when test="$units = 'verses'">
-                <xsl:apply-templates select="descendant::tei:l"/>
-            </xsl:when>
-            <xsl:when test="$units = 'words'">
-                <xsl:apply-templates select="descendant::tei:w"/>
-            </xsl:when>
-        </xsl:choose>
-    </xsl:template>
-    
-    <xsl:template match="tei:l">
-        <xsl:choose>
-            <xsl:when test="$feats = 'met'">
-                    <xsl:choose>
-                        <xsl:when test="$keep_punct = 'true'">
-                            <xsl:value-of select="@met"/>
-                        </xsl:when>
-                        <xsl:otherwise>
-                            <xsl:value-of select="translate(@met, '.', '')"/>
-                        </xsl:otherwise>
-                    </xsl:choose>
-            </xsl:when>
-            <xsl:otherwise>
-                <xsl:apply-templates select="descendant::tei:w"/>
-            </xsl:otherwise>
-        </xsl:choose>
-        <xsl:text>&#xA;</xsl:text>
-    </xsl:template>
-    
-    <xsl:template match="tei:w">
-        <xsl:text> </xsl:text>
-        <xsl:choose>
-            <xsl:when test="$feats = 'met'">
-                <xsl:value-of select="@met"/>
-            </xsl:when>
-            <xsl:when test="$feats = 'lemma'">
-                <xsl:value-of select="@lemma"/>
-            </xsl:when>
-            <xsl:when test="$feats = 'pos'">
-                <xsl:value-of select="@pos"/>
-            </xsl:when>
-            <xsl:otherwise>
-                <xsl:apply-templates/>
-            </xsl:otherwise>
-        </xsl:choose>
-        <xsl:if test="$units = 'words'">
-            <!-- Then one word per line -->
-            <xsl:text>&#xA;</xsl:text>
-        </xsl:if>
-    </xsl:template>
-    
-</xsl:stylesheet>''')
-        myxsl = etree.XSLT(myxsl)
-
-        with open(path, 'r') as f:
-            my_doc = etree.parse(f)
-
-        units = str(myxsl(my_doc, units=etree.XSLT.strparam(units), feats=etree.XSLT.strparam(feats))).splitlines()
+        units_tokens = tei_to_units(path, units=units, feats=feats)
 
     # and now generating output
     samples = []
 
     if samples_random:
         for k in range(max_samples):
-            samples.append({"start": str(k)+'s', "end": str(k)+'e', "text": list(random.choices(units, k=size))})
+            samples.append({"start": str(k)+'s', "end": str(k)+'e', "text": list(random.choices(units_tokens, k=size))})
 
     else:
         current = 0
-        while current + size <= len(units):
-            samples.append({"start": current, "end": current + size, "text": list(units[current:(current + size)])})
+        while current + size <= len(units_tokens):
+            samples.append({"start": current, "end": current + size, "text": list(units_tokens[current:(current + size)])})
             current = current + step
 
     return samples
@@ -353,7 +382,7 @@ def docs_to_samples(paths, size, step=None, units="words", samples_random=False,
     :param keep_punct: whether to keep punctuation and caps.
     :param max_samples: maximum number of samples per author/class.
     :param identify_lang: whether to try to identify lang (default: False)
-    :param feats: TODO
+    :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
     :return: a myTexts object
     """
     myTexts = []

From 290286ca607314af56c8bb0fe4a344bfa9609ac8 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Thu, 20 Nov 2025 15:12:08 +0100
Subject: [PATCH 06/10] fixed typos

---
 superstyl/load.py                     | 4 ++--
 superstyl/preproc/features_extract.py | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/superstyl/load.py b/superstyl/load.py
index 7c50a790..fe52683e 100644
--- a/superstyl/load.py
+++ b/superstyl/load.py
@@ -46,10 +46,10 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp
     :return a pandas dataFrame of text metadata and feature frequencies; a global list of features with their frequencies
     """
 
-    if feats in ('lemma', 'pos', 'met_line', 'met_syll') and format is not 'tei':
+    if feats in ('lemma', 'pos', 'met_line', 'met_syll') and format != 'tei':
         raise ValueError("lemma, pos, met_line or met_syll are only possible with adequate tei format (@lemma, @pos, @met)")
 
-    if feats in ('met_line', 'met_syll') and units is not 'lines':
+    if feats in ('met_line', 'met_syll') and units != 'lines':
         raise ValueError("met_line or met_syll are only possible with tei format that includes lines and @met")
 
     embeddedFreqs = False
diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py
index 3ef055d3..d04d8c12 100755
--- a/superstyl/preproc/features_extract.py
+++ b/superstyl/preproc/features_extract.py
@@ -10,7 +10,7 @@ def count_features(text, feats ="words", n = 1):
     Get feature counts from  a text (words, chars or POS n-grams, or affixes(+punct if keep_punct),
     following Sapkota et al., NAACL 2015
     :param text: the source text
-    :param feats: the type of feats: words, chars, POS (supported only for English), or affixes
+    :param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
     :param n: the length of n-grams
     :return: features absolute frequencies in text as a counter, and the total of frequencies
     """
@@ -20,9 +20,9 @@ def count_features(text, feats ="words", n = 1):
         raise ValueError("Text cannot be empty.")
     if n < 1 or not isinstance(n, int):
         raise ValueError("n must be a positive integer.")
-    if feats not in ["words", "chars", "affixes", "lemmas", "pos", "met_line", "met_syll"]:
-        raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', 'lemmas' or 'pos'.")
-    if feats in ("words", "lemmas", "pos"):
+    if feats not in ["words", "chars", "affixes", "lemma", "pos", "met_line", "met_syll"]:
+        raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', 'lemma' or 'pos'.")
+    if feats in ("words", "lemma", "pos"):
         tokens = nltk.tokenize.wordpunct_tokenize(text)
         if n > 1:
             tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]

From 8008860198bdce542d05cd6920fd6030c77473ee Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Thu, 20 Nov 2025 15:20:24 +0100
Subject: [PATCH 07/10] suppressed test for pos tagging

---
 load_corpus.py            | 11 ++++++-----
 tests/test_load_corpus.py | 17 +----------------
 2 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/load_corpus.py b/load_corpus.py
index f325f30d..668e7524 100755
--- a/load_corpus.py
+++ b/load_corpus.py
@@ -18,18 +18,19 @@
     parser.add_argument('-f', action="store", help="optional list of features, either in json (generated by"
                                                    " Superstyl) or simple txt (one word per line)", default=False)
     parser.add_argument('-t', action='store', help="types of features (words, chars, affixes - "
-                                                   "as per Sapkota et al. 2015 - or pos). pos are currently"
-                                                   "only implemented for Modern English", type=str,
-                        default="words", choices=["words", "chars", "affixes", "pos"])
+                                                   "as per Sapkota et al. 2015 -, as well as lemma or pos, met_line, "
+                                                   "met_syll (those four last only for TEI files with proper annotation)"
+                                                   , type=str,
+                        default="words", choices=["words", "chars", "affixes", "pos", "lemma", "met_line", "met_syll"])
     parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int)
     parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
     parser.add_argument('--freqs', action='store', help="relative, absolute or binarised freqs",
                         default="relative",
                         choices=["relative", "absolute", "binary"]
                         )
-    parser.add_argument('-x', action='store', help="format (txt, xml or tei) WARNING: only txt is fully implemented",
+    parser.add_argument('-x', action='store', help="format (txt, xml, tei, or txm) WARNING: only txt is fully implemented",
                         default="txt",
-                        choices=["txt", "xml", "tei"]
+                        choices=["txt", "xml", "tei", 'txm']
                         )
     parser.add_argument('--sampling', action='store_true', help="Sample the texts?", default=False)
     parser.add_argument('--sample_units', action='store', help="Units of length for sampling "
diff --git a/tests/test_load_corpus.py b/tests/test_load_corpus.py
index 5de65d26..076aa199 100644
--- a/tests/test_load_corpus.py
+++ b/tests/test_load_corpus.py
@@ -286,22 +286,7 @@ def test_load_corpus(self):
         self.assertEqual(sorted(feats), sorted(expected_feats))
         self.assertEqual(corpus.to_dict(), expected_corpus)
 
-        # WHEN
-        corpus, feats = superstyl.load.load_corpus(sorted(self.paths[1:]), feats="pos", n=1, format="txt", freqsType="absolute")
-
-        # THEN
-        expected_feats = [('DT', 4), ('NN', 2), ('VBZ', 2), ('RB', 1)]
-        expected_corpus = {
-        'author': {'Smith_Letter1.txt': 'Smith', 'Smith_Letter2.txt': 'Smith'},
-        'lang': {'Smith_Letter1.txt': 'NA', 'Smith_Letter2.txt': 'NA'},
-        'DT': {'Smith_Letter1.txt': 2 , 'Smith_Letter2.txt': 2},
-        'NN': {'Smith_Letter1.txt': 1 , 'Smith_Letter2.txt': 1},  
-        'VBZ': {'Smith_Letter1.txt': 1, 'Smith_Letter2.txt': 1},
-        'RB': {'Smith_Letter1.txt': 0, 'Smith_Letter2.txt': 1}
-        }
-
-        self.assertEqual(sorted(feats), sorted(expected_feats))
-        self.assertEqual(corpus.to_dict(), expected_corpus)
+        # TODO: add tests for lemma, pos, met_line, met_syll, and loading from tei, and from txm
 
         # Now, test embedding
         # WHEN

From e58cf74d0678afd00504f5629ed16f5e44acf182 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Thu, 20 Nov 2025 15:22:08 +0100
Subject: [PATCH 08/10] cleaned imports

---
 superstyl/load_from_config.py | 2 --
 superstyl/preproc/pipe.py     | 2 --
 2 files changed, 4 deletions(-)

diff --git a/superstyl/load_from_config.py b/superstyl/load_from_config.py
index 5dc7d3a6..220b0b21 100644
--- a/superstyl/load_from_config.py
+++ b/superstyl/load_from_config.py
@@ -1,7 +1,5 @@
 import json
-import superstyl
 import pandas as pd
-import os
 import glob
 
 from superstyl.load import load_corpus
diff --git a/superstyl/preproc/pipe.py b/superstyl/preproc/pipe.py
index 81819a00..811451cd 100755
--- a/superstyl/preproc/pipe.py
+++ b/superstyl/preproc/pipe.py
@@ -1,5 +1,3 @@
-import unicodedata
-
 from lxml import etree
 import regex as re
 import unidecode

From 05885dfb05ae814b1d238d3657494910c57a0e84 Mon Sep 17 00:00:00 2001
From: Theo <theo.moins@gmail.com>
Date: Thu, 27 Nov 2025 13:36:24 +0100
Subject: [PATCH 09/10] fix column rename

---
 superstyl/load_from_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superstyl/load_from_config.py b/superstyl/load_from_config.py
index 5e3bfb3e..4bc7cca4 100644
--- a/superstyl/load_from_config.py
+++ b/superstyl/load_from_config.py
@@ -181,7 +181,7 @@ def load_corpus_from_config(config_path, is_test=False):
         feature_cols = [col for col in corpus.columns if col not in ['author', 'lang']]
         
         # Rename columns to avoid duplicates
-        renamed_cols = {col: col for col in feature_cols}
+        renamed_cols = {col: f"{name}_{col}" for col in feature_cols}
         feature_df = corpus[feature_cols].rename(columns=renamed_cols)
         
         # Merge with the main DataFrame

From 011785119cf8ef4ac5ae5b6339c80679f0be1914 Mon Sep 17 00:00:00 2001
From: Theo <theo.moins@gmail.com>
Date: Thu, 27 Nov 2025 15:44:07 +0100
Subject: [PATCH 10/10] Add feats option to the txm extraction function + add
 unit tests

---
 .coverage                          | Bin 0 -> 53248 bytes
 superstyl/preproc/pipe.py          |  30 ++-
 tests/test_error_handling.py       | 320 +++++++++++++++++++++++++++++
 tests/test_xml_loading.py          | 304 +++++++++++++++++++++++++++
 tests/testdata/Dupont_TEIPoem1.xml |  30 +++
 tests/testdata/Smith_Song1.xml     |   5 +
 tests/testdata/Smith_TXM1.xml      |  55 +++++
 7 files changed, 739 insertions(+), 5 deletions(-)
 create mode 100644 .coverage
 create mode 100644 tests/test_error_handling.py
 create mode 100644 tests/test_xml_loading.py
 create mode 100644 tests/testdata/Dupont_TEIPoem1.xml
 create mode 100644 tests/testdata/Smith_Song1.xml
 create mode 100644 tests/testdata/Smith_TXM1.xml

diff --git a/.coverage b/.coverage
new file mode 100644
index 0000000000000000000000000000000000000000..cb8c5ebec31f2b245cb66d3681fec473c5d7d5c4
GIT binary patch
literal 53248
zcmeI4eQXow9mntNvmL+Pfg<YBw9R!I4S~cV+sa5;p=_Xzjlm!cRzh*l_Qko(_L;kL
zNF)YvI|;Q?wGAq@Fv_Y<8+2Q{X^j4{_o~ufrB>B6snaI4Y+dPw*rtx5B51tdb1$(S
zlBzO6sLl6e-*b1*^L+33e4gL)JbrB7)4zFxW~uU^ZWI(t{s=dZ<9Y5bS>`w{Lhm@e
zy)8rsf_Fk8Uw1syVT4PLJQ$a@ak1dToU|*xN$QRrjo%i1DOLzS6g?JF!WoLd1_B@e
z0w6HA2~;-6LWzY7`TM_QDe1gw8A?VqoPGS0Th^`Fx=!A@<`Wy%$<8%-aYUxAyIWo(
z8~Pr(q#E*|mRDu1nAI|hr4@6sHKcl}%ch!T107wXp^ha?+WFa0O3P9%OU;o)$<PXl
zF(!Xe9qY0axOcPEQOmhNBC3`vvJ~=SH+!dSsDrAZ7Bi~pSg}~kc1AiM*dGZcT3h)8
zs$C|<$j~d=P#U^WE6Vyo63XaCmRX-RlwxK`HM?YGvV<9(>>Rb6(i(c9$I|7zR<w&*
zG&M`pi?TYZX3CbDowk5o44StQy!z6R)_tr%_c=M;d)3jYWu7Ldo7~MjNX8X)4{I$W
zyESFDZ7!AzX+l0+E|gg1P!8wDmOfL?netk?TiAD9H`CN*t-KA@=^<w|>f|aHwcTZP
zvhus+TFx%J7)8BHW`!URSQHH=`dWF8yiy%CcjxKzR!S*bx_zFa%1`yuF81a5Eulnv
zJAYue?I;XSDy>=O<UzQ>X(lhv;5saRX1`&rq|r%OE6V3(eb8<)EVt}b4taBZo9Xdj
zC<be2vhF4%ORuZ9F=NRFIi1(jjlO1kjHyxsp}xRNHdroZSW_`s88Z5aYA881>Bc(U
z!u56}a-H6AFtM(ECfAvCfmHA0VWhG!7)rFY@%M*qH*#ys4MGhr<kjRf-KZB;P!+Rm
zcpb?*b&_6=S}*Cf5jTd4(;W&X*0jx395-F6*F9*dgoIFH!2-VGv<TKOv3H=M5bR2=
z2}~IX-o5!H#Tt^Yw4Gfh!#e`O#L5LT8O}^d^|F2NfdQXw@_~gm{Q`CR?CozbnO&)w
z{26tudql}oZ&Fe;gPJV$!OjREvxHfN5-VxFY}x+HPHp#OV^7UqN1%4@v8Pm>c5-Uo
zpVLiKqhXZ|Tjm|!Rk>qGF<CEUcF1X6&#Ow&85=0adXw&{nX+xem7~(?nua;XG;$*y
zwoTsP!n*mb)abnSMF)j~Qp~#J6&r>fRqFd$<7D@7IuXh!cTl4=sfkX}^hDiQ)?<14
z$r3SPss$zMG<Q}hb&7k2)GOqi(P*t=sg_nyWtPM?#&)bdiYaFdl_F^@?d<TslN+$w
zO7*fmn<clfk8+iEK9pFxl&_TC#_04jDU$~Eg5sI&p86V`WJ+YsUp7&POP_Y?{e-cr
z!@*9SJFw2psll{E!As?S)`@ho2|h1f;^>791V8`;KmY_l00ck)1V8`;KmY_l;QdFy
z&-?fgoB#Wy)135g+F=6$5C8!X009sH0T2KI5C8!X009tqZxV?4{5MP9Up$scyuU5s
z{t4h?$rZ_E>`(qa=?zYLL;Bx)vw*M~1V8`;KmY_l00ck)1V8`;KmY_lz#|az-^_d8
z0{9~Swy^sp0Q>!4jCOL;_oST?C!P@3#$S&=7{4R-&)C7(is+xC`zaC|2!H?xfB*=9
z00@8p2+TDCTSbAZboUJD1+~X2(Bchd&ssfGE~rJzq%{!=C3?h{EhwPm<<gLr*YkRA
ztf#+BYm)R^WBDFSH7On2r)W75t(Y=L3iP=0jyS3IHLYqgeUs_ubtRi(>u03s>G?q|
zN7@51(q7rLcAdr3h<_nU;<q;~UNgL8s3k+s^q6X1%~)jiwg@TrHmy9>Y+6dCR5nR>
zW0-Vry-M9wN-JtsDn)vqZ6Uq(ruAwB98VYskzDh&Rg?38s<g67maQT~w(bm)_?@$0
zt0&G@8X!+A7g>#^1Ejuw7S#vofqPnY!c5U!Xbm9SD+Ibn^ImZ!o|0Bl$@U$7(w&V4
z=#i=Okc~dlZr<Z+Hna9ewr=4`uX7eS*V)K^{}-c6IO%EW-srf*#~%{ciBHA);%DM3
zV`pQBqJN=CY#;yvAOHd&00JNY0w6Hw2z*|sbn{Jr_^z9l*3bXN0in{@ylUeVVaohJ
zzEh~IY+k$0;%Vmpv3rEd?ahnV4EN?ulk@-RcA?VSyz*4DC+Gi>O+w|?tJSTWIfXY0
zw7gr>4pAfE%>P@iuC{7&9?<&vf9PJJa_4N=>WMeZ|AQH!vVJzzr<wl;uFfm2#N*EY
zh1qI=9+^51@n4<C)oga>|Gq6kr87JW9#>~0*8iJx0~rbo0w4eaAOHd&00JNY0w4ea
zATYNG_<27UXTSgR(q)cb*gyaTKmY_l00ck)1V8`;KmY_l00iEL1pNMZ*na+BT**ln
zrB|hwrN^a;G%9J*aw#ePMtnvz#eRy!1_B@e0w4eaAOHd&00JNY0w6G#3EUtGd^jmi
z9I2kYTl(uakN>)Q{#%y|*`>F{>H4bpE}m;^ziCO|nbp;5?(o=nwc7vb4F|gpE$E9;
zycL`HVE4s+?UzG)qJR7LvZ9ge`ugMZ51stc$)hJfp1b(`JtN87nK!GKs;763zOee}
z-`ihM|8!XRK$Nn6IC^Xn`?z~Fo{a?faOG2xiS1k8Uj6oqmicCtZ7+Vb^(_ka5BgY8
zz3n;Lz4Geiue|)*AO1esdhX1{>Z6Y)dRD!rEjj$ihsG}a;Tsd@bNep@)}A;u{P2w@
zPAPW}A33t)m@zzj`uo*wsY)<R_ZP#y-WCe8Eff1iZan$53%`1DbKu}(D+U6~=R5Y4
ze)iwjXgvpR@!o0+_ty4jk6uc|LzLyFP~?VX;qia`#}^9H>Ac|1)i(y{V0mCK|DB)z
z<i{MhwDjDfUp{+09lmS$yS_ew;w)jp$5dsu8)rLv6Xq>m^i}S$C!g|d8c?|>7VQ}B
z<$iE(y|pXI?S155-9bNP75%=1kHTfXeYdbo2Y<2Z?BRV+|M4)lg%9vTI4lP3=l`WA
zIO&pfUOFeeF1<qE0=y*sPI_MYsdP$uMmkQB*gyaTKmY_l00ck)1V8`;KmY_l00gcn
z0roWkFN$^$kJ~{kW(U!z9Yi8_5DwcxON$+Z?5`L2V9-7b1nfW%?7;801D{WzF97V{
z|G9bBv<MIf0T2KI5C8!X009sH0T2KI5C8!XxXuLF@Bf(pU*}4|?LhzpKmY_l00ck)
w1V8`;KmY_l;2INP&;Mioe~lqS9t1!D1V8`;KmY_l00ck)1V8`;t}}uE1Io6YN&o-=

literal 0
HcmV?d00001

diff --git a/superstyl/preproc/pipe.py b/superstyl/preproc/pipe.py
index 811451cd..7c675b68 100755
--- a/superstyl/preproc/pipe.py
+++ b/superstyl/preproc/pipe.py
@@ -50,14 +50,21 @@ def XML_to_text(path):
         return aut, re.sub(r"\s+", " ", str(myxsl(my_doc)))
 
 
-def txm_to_units(path, units="lines"):
-    #TODO: it would be fairly easy to implement lemma and pos feats, like for tei. If it is ever useful
+def txm_to_units(path, units="lines", feats="words"):
+    """
+    Extract units from TXM file
+    :param path: path to TXM file
+    :param units: units to extract ("lines"/"verses" or "words")
+    :param feats: features to extract ("words", "lemma", or "pos")
+    :return: list of extracted units
+    """
     myxsl = etree.XML('''<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
     xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:txm="http://textometrie.org/1.0" 
     version="1.0">
 
     <xsl:output method="text"/>
     <xsl:param name="units"></xsl:param>
+    <xsl:param name="feats"></xsl:param>
 
     <xsl:template match="/">
         <xsl:choose>
@@ -79,7 +86,21 @@ def txm_to_units(path, units="lines"):
 
     <xsl:template match="tei:w">
         <xsl:text> </xsl:text>
-        <xsl:apply-templates select="txm:form"/>
+        <xsl:choose>
+            <xsl:when test="$feats = 'lemma'">
+                <xsl:value-of select="txm:lemma"/>
+            </xsl:when>
+            <xsl:when test="$feats = 'pos'">
+                <xsl:value-of select="txm:ana[@type='#frpos']"/>
+            </xsl:when>
+            <xsl:otherwise>
+                <xsl:apply-templates select="txm:form"/>
+            </xsl:otherwise>
+        </xsl:choose>
+        <xsl:if test="$units = 'words'">
+            <!-- Then one word per line -->
+            <xsl:text>&#xA;</xsl:text>
+        </xsl:if>
     </xsl:template>
 
 </xsl:stylesheet>''')
@@ -88,8 +109,7 @@ def txm_to_units(path, units="lines"):
     with open(path, 'r') as f:
         my_doc = etree.parse(f)
 
-    #units_tokens = str(myxsl(my_doc, units=etree.XSLT.strparam(units), feats=etree.XSLT.strparam(feats))).splitlines()
-    units_tokens = str(myxsl(my_doc, units=etree.XSLT.strparam(units))).splitlines()
+    units_tokens = str(myxsl(my_doc, units=etree.XSLT.strparam(units), feats=etree.XSLT.strparam(feats))).splitlines()
     return units_tokens
 
 def tei_to_units(path, feats="words", units="lines"):
diff --git a/tests/test_error_handling.py b/tests/test_error_handling.py
new file mode 100644
index 00000000..cc37116a
--- /dev/null
+++ b/tests/test_error_handling.py
@@ -0,0 +1,320 @@
+import unittest
+import superstyl.load
+import superstyl.preproc.features_extract
+from superstyl.load_from_config import load_corpus_from_config
+import os
+import tempfile
+import json
+import glob
+
+THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+class ErrorHandlingTests(unittest.TestCase):
+    """Tests for error handling and ValueError raising"""
+    
+    def setUp(self):
+        """Set up test files paths"""
+        self.test_paths = sorted(glob.glob(os.path.join(THIS_DIR, "testdata/*.txt")))
+        self.temp_dir = tempfile.TemporaryDirectory()
+    
+    def tearDown(self):
+        """Clean up temporary directory"""
+        self.temp_dir.cleanup()
+    
+    # =========================================================================
+    # Tests for load.py - ValueError for incompatible formats
+    # =========================================================================
+    
+    def test_load_corpus_lemma_requires_tei(self):
+        # SCENARIO: lemma features require TEI format
+        # GIVEN: Attempting to use lemma with non-TEI format
+        
+        # WHEN/THEN: Should raise ValueError
+        with self.assertRaises(ValueError) as context:
+            superstyl.load.load_corpus(
+                self.test_paths,
+                feats="lemma",
+                format="txt"
+            )
+        
+        self.assertIn("lemma", str(context.exception))
+        self.assertIn("tei", str(context.exception).lower())
+    
+    def test_load_corpus_pos_requires_tei(self):
+        # SCENARIO: pos features require TEI format
+        # GIVEN: Attempting to use pos with non-TEI format
+        
+        # WHEN/THEN: Should raise ValueError
+        with self.assertRaises(ValueError) as context:
+            superstyl.load.load_corpus(
+                self.test_paths,
+                feats="pos",
+                format="txt"
+            )
+        
+        self.assertIn("pos", str(context.exception))
+        self.assertIn("tei", str(context.exception).lower())
+    
+    def test_load_corpus_met_line_requires_tei(self):
+        # SCENARIO: met_line features require TEI format
+        # GIVEN: Attempting to use met_line with non-TEI format
+        
+        # WHEN/THEN: Should raise ValueError
+        with self.assertRaises(ValueError) as context:
+            superstyl.load.load_corpus(
+                self.test_paths,
+                feats="met_line",
+                format="txt"
+            )
+        
+        self.assertIn("met_line", str(context.exception))
+        self.assertIn("tei", str(context.exception).lower())
+    
+    def test_load_corpus_met_syll_requires_tei(self):
+        # SCENARIO: met_syll features require TEI format
+        # GIVEN: Attempting to use met_syll with non-TEI format
+        
+        # WHEN/THEN: Should raise ValueError
+        with self.assertRaises(ValueError) as context:
+            superstyl.load.load_corpus(
+                self.test_paths,
+                feats="met_syll",
+                format="txt"
+            )
+        
+        self.assertIn("met_syll", str(context.exception))
+        self.assertIn("tei", str(context.exception).lower())
+    
+    def test_load_corpus_met_line_requires_lines_unit(self):
+        # SCENARIO: met_line requires units='lines'
+        # GIVEN: Attempting to use met_line with units='words'
+        
+        # Create a dummy TEI file for this test
+        tei_path = os.path.join(self.temp_dir.name, "test_met.xml")
+        with open(tei_path, 'w') as f:
+            f.write('<?xml version="1.0"?><TEI xmlns="http://www.tei-c.org/ns/1.0"><text><body><l met="01">test</l></body></text></TEI>')
+        
+        # WHEN/THEN: Should raise ValueError
+        with self.assertRaises(ValueError) as context:
+            superstyl.load.load_corpus(
+                [tei_path],
+                feats="met_line",
+                format="tei",
+                units="words"  # Wrong unit type
+            )
+        
+        self.assertIn("met_line", str(context.exception))
+        self.assertIn("lines", str(context.exception))
+    
+    def test_load_corpus_met_syll_requires_lines_unit(self):
+        # SCENARIO: met_syll requires units='lines'
+        # GIVEN: Attempting to use met_syll with units='words'
+        
+        # Create a dummy TEI file for this test
+        tei_path = os.path.join(self.temp_dir.name, "test_met2.xml")
+        with open(tei_path, 'w') as f:
+            f.write('<?xml version="1.0"?><TEI xmlns="http://www.tei-c.org/ns/1.0"><text><body><l met="01">test</l></body></text></TEI>')
+        
+        # WHEN/THEN: Should raise ValueError
+        with self.assertRaises(ValueError) as context:
+            superstyl.load.load_corpus(
+                [tei_path],
+                feats="met_syll",
+                format="tei",
+                units="words"  # Wrong unit type
+            )
+        
+        self.assertIn("met_syll", str(context.exception))
+        self.assertIn("lines", str(context.exception))
+    
+    # =========================================================================
+    # Tests for features_extract.py - ValueError for invalid parameters
+    # =========================================================================
+    
+    def test_count_features_empty_text(self):
+        # SCENARIO: Empty text should raise ValueError
+        # GIVEN: An empty string as text
+        
+        # WHEN/THEN: Should raise ValueError
+        with self.assertRaises(ValueError) as context:
+            superstyl.preproc.features_extract.count_features(
+                "",  # Empty text
+                feats="words",
+                n=1
+            )
+        
+        self.assertIn("empty", str(context.exception).lower())
+    
+    def test_count_features_invalid_n_zero(self):
+        # SCENARIO: n must be positive
+        # GIVEN: n=0
+        
+        # WHEN/THEN: Should raise ValueError
+        with self.assertRaises(ValueError) as context:
+            superstyl.preproc.features_extract.count_features(
+                "test text",
+                feats="words",
+                n=0  # Invalid n
+            )
+        
+        self.assertIn("positive", str(context.exception).lower())
+    
+    def test_count_features_invalid_n_negative(self):
+        # SCENARIO: n must be positive
+        # GIVEN: n=-1
+        
+        # WHEN/THEN: Should raise ValueError
+        with self.assertRaises(ValueError) as context:
+            superstyl.preproc.features_extract.count_features(
+                "test text",
+                feats="words",
+                n=-1  # Invalid n
+            )
+        
+        self.assertIn("positive", str(context.exception).lower())
+    
+    def test_count_features_invalid_n_not_integer(self):
+        # SCENARIO: n must be an integer
+        # GIVEN: n=1.5 (float)
+        
+        # WHEN/THEN: Should raise ValueError
+        with self.assertRaises(ValueError) as context:
+            superstyl.preproc.features_extract.count_features(
+                "test text",
+                feats="words",
+                n=1.5  # Not an integer
+            )
+        
+        self.assertIn("integer", str(context.exception).lower())
+    
+    def test_count_features_invalid_not_string(self):
+        # SCENARIO: text must be a string
+        # GIVEN: text is not a string (e.g., None)
+        
+        # WHEN/THEN: Should raise ValueError
+        with self.assertRaises(ValueError) as context:
+            superstyl.preproc.features_extract.count_features(
+                None,  # Not a string
+                feats="words",
+                n=1
+            )
+        
+        self.assertIn("string", str(context.exception).lower())
+    
+    def test_count_features_unsupported_feats_type(self):
+        # SCENARIO: feats must be a supported type
+        # GIVEN: An unsupported feats type
+        
+        # WHEN/THEN: Should raise ValueError
+        with self.assertRaises(ValueError) as context:
+            superstyl.preproc.features_extract.count_features(
+                "test text",
+                feats="unsupported_type",  # Invalid feats type
+                n=1
+            )
+        
+        self.assertIn("Unsupported", str(context.exception))
+    
+    def test_get_counts_invalid_frequency_type(self):
+        # SCENARIO: freqsType must be valid
+        # GIVEN: An unsupported frequency type
+        
+        myTexts = [{"name": "test", "text": "test text"}]
+        
+        # WHEN/THEN: Should raise ValueError
+        with self.assertRaises(ValueError) as context:
+            superstyl.preproc.features_extract.get_counts(
+                myTexts,
+                feats="words",
+                freqsType="invalid_type"  # Invalid frequency type
+            )
+        
+        self.assertIn("Unsupported frequency type", str(context.exception))
+    
+    # =========================================================================
+    # Tests for load_from_config.py - Previously uncovered branches
+    # =========================================================================
+    
+    def test_load_from_config_with_json_feature_list(self):
+        # SCENARIO: Load corpus with JSON feature list (line 119)
+        # GIVEN: A config with a JSON feature list
+        
+        # Create a JSON feature list
+        feature_list = [["the", 0], ["is", 0]]
+        feature_list_path = os.path.join(self.temp_dir.name, "features.json")
+        with open(feature_list_path, 'w') as f:
+            json.dump(feature_list, f)
+        
+        # Create config
+        config = {
+            "paths": self.test_paths,
+            "format": "txt",
+            "features": [
+                {
+                    "name": "test_feature",
+                    "type": "words",
+                    "n": 1,
+                    "feat_list": feature_list_path  # JSON feature list
+                }
+            ]
+        }
+        
+        config_path = os.path.join(self.temp_dir.name, "config.json")
+        with open(config_path, 'w') as f:
+            json.dump(config, f)
+        
+        # WHEN: Loading corpus from config
+        corpus, features = load_corpus_from_config(config_path)
+        
+        # THEN: Should load successfully with JSON feature list
+        self.assertIsNotNone(corpus)
+        self.assertIsNotNone(features)
+    
+    def test_load_from_config_test_mode_uses_feat_list(self):
+        # SCENARIO: In test mode, use provided feat_list (line 156)
+        # GIVEN: A config with feat_list in test mode
+        
+        # Create a JSON feature list
+        feature_list = [["the", 0], ["is", 0], ["text", 0]]
+        feature_list_path = os.path.join(self.temp_dir.name, "test_features.json")
+        with open(feature_list_path, 'w') as f:
+            json.dump(feature_list, f)
+        
+        # Create config with multiple features (triggers is_test logic)
+        config = {
+            "paths": self.test_paths,
+            "format": "txt",
+            "features": [
+                {
+                    "name": "feat1",
+                    "type": "words",
+                    "n": 1,
+                    "feat_list": feature_list_path
+                },
+                {
+                    "name": "feat2",
+                    "type": "chars",
+                    "n": 2,
+                    "feat_list": feature_list_path
+                }
+            ]
+        }
+        
+        config_path = os.path.join(self.temp_dir.name, "multi_config.json")
+        with open(config_path, 'w') as f:
+            json.dump(config, f)
+        
+        # WHEN: Loading corpus from config
+        corpus, features = load_corpus_from_config(config_path, is_test=True)
+        
+        # THEN: Should use the provided feature list
+        self.assertIsNotNone(corpus)
+        self.assertIsNotNone(features)
+        # features should be a list of feature lists
+        self.assertIsInstance(features, list)
+        self.assertEqual(len(features), 2)  # Two feature sets
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/tests/test_xml_loading.py b/tests/test_xml_loading.py
new file mode 100644
index 00000000..c874bbb8
--- /dev/null
+++ b/tests/test_xml_loading.py
@@ -0,0 +1,304 @@
+import unittest
+import superstyl.preproc.pipe
+import os
+import glob
+
+THIS_DIR = os.path.dirname(os.path.abspath(__file__))  # absolute directory of this test module
+
+
+class XMLLoadingTests(unittest.TestCase):
+    """Tests for XML, TEI, and TXM file loading functions"""
+    
+    def setUp(self):
+        """Set up test files paths"""
+        self.xml_path = os.path.join(THIS_DIR, "testdata", "Smith_Song1.xml")
+        self.tei_path = os.path.join(THIS_DIR, "testdata", "Dupont_TEIPoem1.xml")
+        self.txm_path = os.path.join(THIS_DIR, "testdata", "Smith_TXM1.xml")
+    
+    def test_XML_to_text(self):
+        # SCENARIO: Load text from a simple XML file
+        # GIVEN: An XML file with author and text elements
+        
+        # WHEN: Loading the XML file
+        aut, text = superstyl.preproc.pipe.XML_to_text(self.xml_path)
+        
+        # THEN: Author and text are correctly extracted
+        self.assertEqual(aut, "Smith")
+        self.assertIn("test song", text)
+        self.assertIn("lyrics", text)
+        # Whitespace is normalized: no two consecutive spaces remain
+        self.assertNotIn("  ", text)
+    
+    def test_tei_to_units_words(self):
+        # SCENARIO: Extract words from a TEI file
+        # GIVEN: A TEI file with annotated words
+        
+        # WHEN: Extracting words as units
+        units_tokens = superstyl.preproc.pipe.tei_to_units(
+            self.tei_path, 
+            feats="words", 
+            units="words"
+        )
+        
+        # THEN: Words are extracted, one per line
+        self.assertIsInstance(units_tokens, list)
+        self.assertGreater(len(units_tokens), 0)
+        # Word forms show up as individual units once stripped
+        self.assertIn("This", [u.strip() for u in units_tokens])
+        self.assertIn("is", [u.strip() for u in units_tokens])
+    
+    def test_tei_to_units_verses(self):
+        # SCENARIO: Extract verses (lines) from a TEI file
+        # GIVEN: A TEI file with verse lines
+        
+        # WHEN: Extracting verses as units
+        units_tokens = superstyl.preproc.pipe.tei_to_units(
+            self.tei_path, 
+            feats="words", 
+            units="verses"
+        )
+        
+        # THEN: Each verse is on a separate line
+        self.assertIsInstance(units_tokens, list)
+        # The TEI fixture contains two <l> elements
+        self.assertEqual(len(units_tokens), 2)
+    
+    def test_tei_to_units_lemma(self):
+        # SCENARIO: Extract lemmas from a TEI file
+        # GIVEN: A TEI file with lemma annotations
+        
+        # WHEN: Extracting lemmas
+        units_tokens = superstyl.preproc.pipe.tei_to_units(
+            self.tei_path, 
+            feats="lemma", 
+            units="words"
+        )
+        
+        # THEN: Lemmas are extracted
+        self.assertIsInstance(units_tokens, list)
+        self.assertIn("this", [u.strip() for u in units_tokens])
+        self.assertIn("be", [u.strip() for u in units_tokens])
+    
+    def test_tei_to_units_pos(self):
+        # SCENARIO: Extract POS tags from a TEI file
+        # GIVEN: A TEI file with POS annotations
+        
+        # WHEN: Extracting POS tags
+        units_tokens = superstyl.preproc.pipe.tei_to_units(
+            self.tei_path, 
+            feats="pos", 
+            units="words"
+        )
+        
+        # THEN: POS tags are extracted
+        self.assertIsInstance(units_tokens, list)
+        self.assertIn("DET", [u.strip() for u in units_tokens])
+        self.assertIn("VERB", [u.strip() for u in units_tokens])
+    
+    def test_tei_to_units_met_syll(self):
+        # SCENARIO: Extract metrical syllables from a TEI file
+        # GIVEN: A TEI file with metrical annotations
+        
+        # WHEN: Extracting metrical syllables with met_syll feature
+        units_tokens = superstyl.preproc.pipe.tei_to_units(
+            self.tei_path, 
+            feats="met_syll", 
+            units="verses"
+        )
+        
+        # THEN: Metrical annotations are extracted
+        self.assertIsInstance(units_tokens, list)
+        # Only non-emptiness is asserted; the @met values are not checked here
+        self.assertGreater(len(units_tokens), 0)
+    
+    def test_tei_to_units_met_line(self):
+        # SCENARIO: Extract metrical lines from a TEI file
+        # GIVEN: A TEI file with metrical annotations on lines
+        
+        # WHEN: Extracting metrical patterns at line level
+        units_tokens = superstyl.preproc.pipe.tei_to_units(
+            self.tei_path, 
+            feats="met_line", 
+            units="verses"
+        )
+        
+        # THEN: Metrical patterns for each line are extracted
+        self.assertIsInstance(units_tokens, list)
+        self.assertEqual(len(units_tokens), 2)
+        # Each unit contains the @met pattern of the corresponding <l>
+        self.assertIn("01010101", units_tokens[0])
+        self.assertIn("10101010", units_tokens[1])
+    
+    def test_txm_to_units_words(self):
+        # SCENARIO: Extract words from a TXM file
+        # GIVEN: A TXM file with annotated words
+        
+        # WHEN: Extracting words as units
+        units_tokens = superstyl.preproc.pipe.txm_to_units(
+            self.txm_path, 
+            units="words"
+        )
+        
+        # THEN: Words are extracted
+        # NOTE(review): with units='words' the NOMpro filter is presumably
+        # not applied; this test does not assert it either way
+        self.assertIsInstance(units_tokens, list)
+        self.assertGreater(len(units_tokens), 0)
+        text_content = ' '.join(units_tokens)
+        # Spot-check two forms taken from the first <l> of the fixture
+        self.assertIn("This", text_content)
+        self.assertIn("test", text_content)
+    
+    def test_txm_to_units_verses(self):
+        # SCENARIO: Extract verses from a TXM file
+        # GIVEN: A TXM file with verse lines
+        
+        # WHEN: Extracting verses as units
+        units_tokens = superstyl.preproc.pipe.txm_to_units(
+            self.txm_path, 
+            units="verses"
+        )
+        
+        # THEN: Each verse is extracted and NOMpro words are filtered out
+        self.assertIsInstance(units_tokens, list)
+        self.assertEqual(len(units_tokens), 2)
+        # Check that NOMpro words are excluded in verse mode
+        text_content = ' '.join(units_tokens)
+        self.assertNotIn("here", text_content)  # "here" has NOMpro tag and should be filtered
+        self.assertIn("This", text_content)  # Regular words should be present
+    
+    def test_txm_to_units_lemma(self):
+        # SCENARIO: Extract lemmas from a TXM file
+        # GIVEN: A TXM file with lemma annotations
+        
+        # WHEN: Extracting lemmas
+        units_tokens = superstyl.preproc.pipe.txm_to_units(
+            self.txm_path,
+            units="words",
+            feats="lemma"
+        )
+        
+        # THEN: Lemmas are extracted
+        self.assertIsInstance(units_tokens, list)
+        self.assertIn("be", [u.strip() for u in units_tokens])  # lemma of "is"
+        self.assertIn("this", [u.strip() for u in units_tokens])
+    
+    def test_txm_to_units_pos(self):
+        # SCENARIO: Extract POS tags from a TXM file
+        # GIVEN: A TXM file with POS annotations
+        
+        # WHEN: Extracting POS tags
+        units_tokens = superstyl.preproc.pipe.txm_to_units(
+            self.txm_path,
+            units="words",
+            feats="pos"
+        )
+        
+        # THEN: POS tags are extracted
+        self.assertIsInstance(units_tokens, list)
+        self.assertIn("DET", [u.strip() for u in units_tokens])
+        self.assertIn("VERB", [u.strip() for u in units_tokens])
+    
+    def test_specialXML_to_text_tei(self):
+        # SCENARIO: Load text from a TEI file using specialXML_to_text
+        # GIVEN: A TEI format file
+        
+        # WHEN: Loading with format="tei"
+        aut, text = superstyl.preproc.pipe.specialXML_to_text(
+            self.tei_path, 
+            format="tei", 
+            feats="words"
+        )
+        
+        # THEN: Author is extracted from filename and text is normalized
+        self.assertEqual(aut, "Dupont")
+        self.assertIsInstance(text, str)
+        self.assertGreater(len(text), 0)
+        # Check that whitespace is normalized (single spaces)
+        self.assertNotIn("  ", text)
+    
+    def test_specialXML_to_text_txm(self):
+        # SCENARIO: Load text from a TXM file using specialXML_to_text
+        # GIVEN: A TXM format file
+        
+        # WHEN: Loading with format="txm"
+        aut, text = superstyl.preproc.pipe.specialXML_to_text(
+            self.txm_path, 
+            format="txm", 
+            feats="words"
+        )
+        
+        # THEN: Author is extracted from filename and text is normalized
+        self.assertEqual(aut, "Smith")
+        self.assertIsInstance(text, str)
+        self.assertGreater(len(text), 0)
+        # Text should contain words from the TXM file
+        self.assertIn("test", text.lower())
+    
+    def test_specialXML_to_text_with_lemma(self):
+        # SCENARIO: Load lemmas from a TEI file
+        # GIVEN: A TEI file with lemma annotations
+        
+        # WHEN: Loading with feats="lemma"
+        aut, text = superstyl.preproc.pipe.specialXML_to_text(
+            self.tei_path, 
+            format="tei", 
+            feats="lemma"
+        )
+        
+        # THEN: Lemmas are in the text
+        self.assertEqual(aut, "Dupont")
+        self.assertIn("be", text)  # lemma of "is"
+        self.assertIn("this", text)
+    
+    def test_specialXML_to_text_with_pos(self):
+        # SCENARIO: Load POS tags from a TEI file
+        # GIVEN: A TEI file with POS annotations
+        
+        # WHEN: Loading with feats="pos"
+        aut, text = superstyl.preproc.pipe.specialXML_to_text(
+            self.tei_path, 
+            format="tei", 
+            feats="pos"
+        )
+        
+        # THEN: POS tags are in the text
+        self.assertEqual(aut, "Dupont")
+        self.assertIn("DET", text)
+        self.assertIn("VERB", text)
+    
+    def test_specialXML_to_text_txm_with_lemma(self):
+        # SCENARIO: Load lemmas from a TXM file
+        # GIVEN: A TXM file with lemma annotations
+        
+        # WHEN: Loading with feats="lemma"
+        aut, text = superstyl.preproc.pipe.specialXML_to_text(
+            self.txm_path,
+            format="txm",
+            feats="lemma"
+        )
+        
+        # THEN: Lemmas are in the text
+        self.assertEqual(aut, "Smith")
+        self.assertIn("be", text)  # lemma of "is"
+        self.assertIn("this", text)
+    
+    def test_specialXML_to_text_txm_with_pos(self):
+        # SCENARIO: Load POS tags from a TXM file
+        # GIVEN: A TXM file with POS annotations
+        
+        # WHEN: Loading with feats="pos"
+        aut, text = superstyl.preproc.pipe.specialXML_to_text(
+            self.txm_path,
+            format="txm",
+            feats="pos"
+        )
+        
+        # THEN: POS tags are in the text
+        self.assertEqual(aut, "Smith")
+        self.assertIn("DET", text)
+        self.assertIn("VERB", text)
+
+
+if __name__ == '__main__':
+    unittest.main()  # run this module's tests when executed as a script
\ No newline at end of file
diff --git a/tests/testdata/Dupont_TEIPoem1.xml b/tests/testdata/Dupont_TEIPoem1.xml
new file mode 100644
index 00000000..a737babd
--- /dev/null
+++ b/tests/testdata/Dupont_TEIPoem1.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<TEI xmlns="http://www.tei-c.org/ns/1.0">
+    <teiHeader>
+        <fileDesc>
+            <titleStmt>
+                <title>Test Poem</title>
+            </titleStmt>
+        </fileDesc>
+    </teiHeader>
+    <text>
+        <body>
+            <lg>
+                <l met="01010101">
+                    <w lemma="this" pos="DET">This</w>
+                    <w lemma="be" pos="VERB">is</w>
+                    <w lemma="the" pos="DET">the</w>
+                    <w lemma="first" pos="ADJ">first</w>
+                    <w lemma="line" pos="NOUN">line</w>
+                </l>
+                <l met="10101010">
+                    <w lemma="and" pos="CONJ">And</w>
+                    <w lemma="this" pos="DET">this</w>
+                    <w lemma="be" pos="VERB">is</w>
+                    <w lemma="the" pos="DET">the</w>
+                    <w lemma="second" pos="ADJ">second</w>
+                </l>
+            </lg>
+        </body>
+    </text>
+</TEI>
\ No newline at end of file
diff --git a/tests/testdata/Smith_Song1.xml b/tests/testdata/Smith_Song1.xml
new file mode 100644
index 00000000..d713cc96
--- /dev/null
+++ b/tests/testdata/Smith_Song1.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<song>
+    <author>Smith</author>
+    <text>This is a test song with some lyrics</text>
+</song>
\ No newline at end of file
diff --git a/tests/testdata/Smith_TXM1.xml b/tests/testdata/Smith_TXM1.xml
new file mode 100644
index 00000000..0411bf57
--- /dev/null
+++ b/tests/testdata/Smith_TXM1.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:txm="http://textometrie.org/1.0">
+    <teiHeader>
+        <fileDesc>
+            <titleStmt>
+                <title>Test TXM Text</title>
+            </titleStmt>
+        </fileDesc>
+    </teiHeader>
+    <text>
+        <body>
+            <lg>
+                <l>
+                    <w>
+                        <txm:form>This</txm:form>
+                        <txm:lemma>this</txm:lemma>
+                        <txm:ana type="#frpos">DET</txm:ana>
+                    </w>
+                    <w>
+                        <txm:form>is</txm:form>
+                        <txm:lemma>be</txm:lemma>
+                        <txm:ana type="#frpos">VERB</txm:ana>
+                    </w>
+                    <w>
+                        <txm:form>a</txm:form>
+                        <txm:lemma>a</txm:lemma>
+                        <txm:ana type="#frpos">DET</txm:ana>
+                    </w>
+                    <w>
+                        <txm:form>test</txm:form>
+                        <txm:lemma>test</txm:lemma>
+                        <txm:ana type="#frpos">NOUN</txm:ana>
+                    </w>
+                </l>
+                <l>
+                    <w>
+                        <txm:form>Second</txm:form>
+                        <txm:lemma>second</txm:lemma>
+                        <txm:ana type="#frpos">ADJ</txm:ana>
+                    </w>
+                    <w>
+                        <txm:form>line</txm:form>
+                        <txm:lemma>line</txm:lemma>
+                        <txm:ana type="#frpos">NOUN</txm:ana>
+                    </w>
+                    <w>
+                        <txm:form>here</txm:form>
+                        <txm:lemma>here</txm:lemma>
+                        <txm:ana type="#frpos">NOMpro</txm:ana>
+                    </w>
+                </l>
+            </lg>
+        </body>
+    </text>
+</TEI>
\ No newline at end of file