Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .coverage
Binary file not shown.
11 changes: 6 additions & 5 deletions load_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,19 @@
parser.add_argument('-f', action="store", help="optional list of features, either in json (generated by"
" Superstyl) or simple txt (one word per line)", default=False)
parser.add_argument('-t', action='store', help="types of features (words, chars, affixes - "
"as per Sapkota et al. 2015 - or pos). pos are currently"
"only implemented for Modern English", type=str,
default="words", choices=["words", "chars", "affixes", "pos"])
"as per Sapkota et al. 2015 -, as well as lemma or pos, met_line, "
"met_syll (those four last only for TEI files with proper annotation)"
, type=str,
default="words", choices=["words", "chars", "affixes", "pos", "lemma", "met_line", "met_syll"])
parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int)
parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
parser.add_argument('--freqs', action='store', help="relative, absolute or binarised freqs",
default="relative",
choices=["relative", "absolute", "binary"]
)
parser.add_argument('-x', action='store', help="format (txt, xml or tei) WARNING: only txt is fully implemented",
parser.add_argument('-x', action='store', help="format (txt, xml, tei, or txm) WARNING: only txt is fully implemented",
default="txt",
choices=["txt", "xml", "tei"]
choices=["txt", "xml", "tei", 'txm']
)
parser.add_argument('--sampling', action='store_true', help="Sample the texts?", default=False)
parser.add_argument('--sample_units', action='store', help="Units of length for sampling "
Expand Down
16 changes: 11 additions & 5 deletions superstyl/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp
Main function to load a corpus from a collection of files, and an optional list of features to extract.
:param data_paths: paths to the source files
:param feat_list: an optional list of features (as created by load_corpus), default None
:param feats: the type of features, one of 'words', 'chars', 'affixes, and 'POS'. Affixes are inspired by
Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and, if keep_pos, punctuation n-grams.
POS are currently only implemented for Modern English
TODO: add met_line, met_syll
:param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
Affixes are inspired by Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and,
if keep_punct, punctuation n-grams. From TEI, pos, lemma, met_line or met_syll can
be extracted; met_line is the prosodic (stress) annotation of a full verse; met_syll is a char n-gram of prosodic
annotation
:param n: n grams lengths (default 1)
:param k: How many most frequent? The function takes the feature at rank k (if k is smaller than the total number of features),
gets its frequency, and only includes features whose total frequency is greater than or equal to it.
Expand Down Expand Up @@ -45,6 +46,12 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp
:return: a pandas DataFrame of text metadata and feature frequencies; a global list of features with their frequencies
"""

if feats in ('lemma', 'pos', 'met_line', 'met_syll') and format != 'tei':
raise ValueError("lemma, pos, met_line or met_syll are only possible with adequate tei format (@lemma, @pos, @met)")

if feats in ('met_line', 'met_syll') and units != 'lines':
raise ValueError("met_line or met_syll are only possible with tei format that includes lines and @met")

embeddedFreqs = False
if embedding:
print(".......loading embedding.......")
Expand Down Expand Up @@ -81,7 +88,6 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, freqsTyp

my_feats = [m[0] for m in feat_list] # keeping only the features without the frequencies
myTexts = fex.get_counts(myTexts, feat_list=my_feats, feats=feats, n=n, freqsType=freqsType)

if embedding:
print(".......embedding counts.......")
myTexts, my_feats = embed.get_embedded_counts(myTexts, my_feats, model, topn=neighbouring_size)
Expand Down
27 changes: 18 additions & 9 deletions superstyl/load_from_config.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import json
import superstyl
import pandas as pd
import os
import glob

from superstyl.load import load_corpus

def load_corpus_from_config(config_path):
def load_corpus_from_config(config_path, is_test=False):
"""
Load a corpus based on a JSON configuration file.

Expand Down Expand Up @@ -55,7 +54,7 @@ def load_corpus_from_config(config_path):

# Get sampling parameters
sampling_params = config.get('sampling', {})

# Use the first feature to create the base corpus with sampling
feature_configs = config.get('features', [])
if not feature_configs:
Expand Down Expand Up @@ -87,9 +86,9 @@ def load_corpus_from_config(config_path):
'sampling': sampling_params.get('enabled', False),
'units': sampling_params.get('units', 'words'),
'size': sampling_params.get('sample_size', 3000),
'step': sampling_params.get('sample_step', None),
'step': sampling_params.get('step', None),
'max_samples': sampling_params.get('max_samples', None),
'samples_random': sampling_params.get('sample_random', False),
'samples_random': sampling_params.get('samples_random', False),
'keep_punct': feature_config.get('keep_punct', False),
'keep_sym': feature_config.get('keep_sym', False),
'no_ascii': feature_config.get('no_ascii', False),
Expand All @@ -115,6 +114,7 @@ def load_corpus_from_config(config_path):
# Check for feature list file
feat_list = None
feat_list_path = feature_config.get('feat_list')
print(feat_list_path)
if feat_list_path:
if feat_list_path.endswith('.json'):
with open(feat_list_path, 'r') as f:
Expand All @@ -133,9 +133,9 @@ def load_corpus_from_config(config_path):
'sampling': sampling_params.get('enabled', False),
'units': sampling_params.get('units', 'words'),
'size': sampling_params.get('sample_size', 3000),
'step': sampling_params.get('sample_step', None),
'step': sampling_params.get('step', None),
'max_samples': sampling_params.get('max_samples', None),
'samples_random': sampling_params.get('sample_random', False),
'samples_random': sampling_params.get('samples_random', False),
'keep_punct': config.get('keep_punct', False),
'keep_sym': config.get('keep_sym', False),
'no_ascii': config.get('no_ascii', False),
Expand All @@ -146,11 +146,17 @@ def load_corpus_from_config(config_path):
}

print(f"Loading {feature_name}...")

corpus, features = load_corpus(paths, feat_list=feat_list, **params)

# Store corpus and features
corpora[feature_name] = corpus
feature_lists[feature_name] = features

if feat_list is not None and is_test:
feature_lists[feature_name] = feat_list
else:
feature_lists[feature_name] = features


# Create a merged dataset
print("Creating merged dataset...")
Expand All @@ -170,6 +176,8 @@ def load_corpus_from_config(config_path):

# Add features from each corpus
for name, corpus in corpora.items():
single_feature = []

feature_cols = [col for col in corpus.columns if col not in ['author', 'lang']]

# Rename columns to avoid duplicates
Expand All @@ -181,8 +189,9 @@ def load_corpus_from_config(config_path):

# Add features to the combined list with prefixes
for feature in feature_lists[name]:
all_features.append((f"{name}_{feature[0]}", feature[1]))
single_feature.append((feature[0], feature[1]))

all_features.append(single_feature)
# Return the merged corpus and combined feature list
return merged, all_features

30 changes: 7 additions & 23 deletions superstyl/preproc/features_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def count_features(text, feats ="words", n = 1):
Get feature counts from a text (words, chars or POS n-grams, or affixes (+punct if keep_punct),
following Sapkota et al., NAACL 2015)
:param text: the source text
:param feats: the type of feats: words, chars, POS (supported only for English), or affixes
:param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
:param n: the length of n-grams
:return: features absolute frequencies in text as a counter, and the total of frequencies
"""
Expand All @@ -20,9 +20,9 @@ def count_features(text, feats ="words", n = 1):
raise ValueError("Text cannot be empty.")
if n < 1 or not isinstance(n, int):
raise ValueError("n must be a positive integer.")
if feats not in ["words", "chars", "affixes", "pos", "met_line", "met_syll"]:
raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', or 'pos'.")
if feats == "words":
if feats not in ["words", "chars", "affixes", "lemma", "pos", "met_line", "met_syll"]:
raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', 'lemma' or 'pos'.")
if feats in ("words", "lemma", "pos"):
tokens = nltk.tokenize.wordpunct_tokenize(text)
if n > 1:
tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]
Expand All @@ -46,20 +46,6 @@ def count_features(text, feats ="words", n = 1):
]
tokens = affs + space_affs_and_punct

#POS in english with NLTK - need to propose spacy later on
elif feats == "pos":
try:
nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except:
nltk.download('averaged_perceptron_tagger_eng')
words = nltk.tokenize.wordpunct_tokenize(text)
pos_tags = [pos for word, pos in nltk.pos_tag(words)]
if n > 1:
tokens = ["_".join(t) for t in list(nltk.ngrams(pos_tags, n))]
else:
tokens = pos_tags
total = len(tokens)

elif feats == "met_line":
tokens = text.split()
if n > 1:
Expand All @@ -73,7 +59,7 @@ def count_features(text, feats ="words", n = 1):

#Adding an error message in case some distracted guy like me would enter something wrong:
else:
raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll' or 'pos'.")
raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes', 'met_line', 'met_syll', 'lemmas' or 'pos'.")

counts = Counter()
counts.update(tokens)
Expand Down Expand Up @@ -108,7 +94,7 @@ def get_feature_list(myTexts, feats="words", n=1, freqsType="relative"):
"""
:param myTexts: a 'myTexts' object, containing documents to be processed
:param feat_list: a list of features to be selected
:param feats: type of feats (words, chars, affixes, POS, met_line, or met_syll)
:param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
:param freqsType: "relative", "absolute" or "binary" frequencies
:param n: n-grams length
:return: list of features, with total frequency
Expand Down Expand Up @@ -142,14 +128,12 @@ def get_doc_frequency(myTexts):
return feats_doc_freq




def get_counts(myTexts, feat_list=None, feats = "words", n = 1, freqsType = "relative"):
"""
Get counts for a collection of texts
:param myTexts: the document collection
:param feat_list: a list of features to be selected (None for all)
:param feats: the type of feats (words, chars, affixes, POS)
:param feats: the type of features, one of 'words', 'chars', 'affixes', 'lemma', 'pos', 'met_line' and 'met_syll'.
:param n: the length of n-grams
:param freqsType: relative, absolute or binarised freqs
:return: the collection with, for each text, a 'wordCounts' dictionary
Expand Down
Loading