-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathExtractFeatures.py
More file actions
145 lines (105 loc) · 4.52 KB
/
ExtractFeatures.py
File metadata and controls
145 lines (105 loc) · 4.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import csv
import nltk
from EmailParser import EmailParser
from nltk.corpus import stopwords
import random
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
class learn_meeting_model:
    """
    Train and evaluate a logistic regression classifier on labelled mails.

    The class reads rows from a 'csv' file, turns each mail into a
    bag-of-words feature set, trains a logistic regression classifier on a
    random 75% of the feature sets and prints the accuracy measured on the
    remaining 25% (a single hold-out split, not cross validation).
    """

    # Number of most frequent non-stopword tokens used as features.
    NUM_WORD_FEATURES = 2000
    # Fraction of the shuffled feature sets used for training; the rest is
    # held out for testing (3:1 split).
    TRAIN_FRACTION = 0.75

    def __init__(self, filename, level=1):
        """
        Run the whole pipeline: read the data, build features, train, test.

        filename : path of the csv data file (for e.g., <location_data.csv>)
        level    : number of self-contained (nested) mails per row that are
                   considered for training
        """
        self.filename = filename
        data_table = self.read_csv()  # reads the datafile
        featuresets = self.create_featuresets(data_table, level)  # creates feature sets
        self.learn_model(featuresets)  # trains and tests a classifier

    def read_csv(self):
        """
        Read self.filename with the csv package.

        Returns the data as a list of rows, each row being a list of strings.
        """
        # The csv module wants a binary file on Python 2 but a text file
        # opened with newline='' on Python 3. `str is bytes` is only true on
        # Python 2, so the Python 3-only keyword arguments are never
        # evaluated there.
        if str is bytes:
            handle = open(self.filename, 'rb')
        else:
            # utf-8 matches the decoding the feature extraction applies to
            # the mail text.
            handle = open(self.filename, 'r', newline='', encoding='utf-8')
        with handle as f:
            return list(csv.reader(f))

    def extract_mail(self, data_table, level=1):
        """
        Process the data_table row by row and, for each row (i.e. each mail),
        tokenize the contained mails up to the mentioned level.

        The mail text is taken from the second-to-last column and the label
        from the last column. Rows with fewer than two columns (for e.g. blank
        lines in the csv file) are skipped.

        Returns a tuple (all_words, cleaned_data):
        all_words    : list of every token across all mails (with repeats),
                       used for frequency counting during feature selection
        cleaned_data : list of [<set of unique tokens in the mail>, label]
        """
        cleaned_data = []
        all_words = []
        # A negative level selects nothing, exactly like range(level) would.
        depth = max(level, 0)
        for row in data_table:
            if len(row) < 2:
                continue  # no mail text / label pair in this row
            mailtext, category = row[-2], row[-1]
            # Presumably tokenize_mail returns a sliceable sequence of the
            # mails nested inside mailtext - confirm against EmailParser.
            mails = EmailParser().tokenize_mail(mailtext)
            row_words = set()
            # Slicing (instead of indexing up to `level`) tolerates mails
            # that contain fewer nested replies than `level`.
            for mail in mails[:depth]:
                if isinstance(mail, bytes):
                    mail = mail.decode("utf8")
                mail_words = nltk.word_tokenize(mail)
                row_words.update(mail_words)  # unique tokens of this row
                all_words.extend(mail_words)  # cumulated for frequency counting
            cleaned_data.append([row_words, category])
        return all_words, cleaned_data

    def remove_stopwords(self, all_words):
        """
        Return all_words without the english stop words from nltk.corpus.

        Order and repeats of the remaining words are preserved. The
        comparison is case sensitive.
        """
        # A set gives constant-time membership tests for every token.
        stop_words = set(stopwords.words('english'))
        return [word for word in all_words if word not in stop_words]

    def find_features(self, words, word_features):
        """
        Create the feature vector (in the form of a dict) for one mail.

        words         : collection of tokens of the mail
        word_features : the tokens selected as features

        Returns {feature_word: True/False}, telling for every feature word
        whether it is present in the mail.
        """
        return {w: (w in words) for w in word_features}

    def create_featuresets(self, data_table, level=1):
        """
        Build the labelled feature sets for all mails in data_table.

        Forms the cleaned data out of the original data_table, collects all
        words across all mails, removes the stopwords, picks the
        NUM_WORD_FEATURES most frequent remaining words as word_features and
        returns a list of (feature dict, label) tuples, one per mail.
        """
        all_words, cleaned_data = self.extract_mail(data_table, level)
        frequencies = nltk.FreqDist(self.remove_stopwords(all_words))
        # most_common() orders by decreasing count; the order of keys() is
        # not guaranteed to be by frequency.
        word_features = [
            word for word, _ in frequencies.most_common(self.NUM_WORD_FEATURES)
        ]
        return [
            (self.find_features(mail, word_features), label)
            for (mail, label) in cleaned_data
        ]

    def learn_model(self, featuresets):
        """
        Train and test the logistic regression classifier on the data.

        The feature sets are shuffled (on a copy, the argument is left
        untouched) and partitioned 3:1 into train and test set. The trained
        classifier is kept in self.classifier, the accuracy on the test set
        is printed in percent and returned as a fraction.
        """
        shuffled = list(featuresets)
        random.shuffle(shuffled)
        limit = int(self.TRAIN_FRACTION * len(shuffled))
        train_set = shuffled[:limit]
        test_set = shuffled[limit:]
        lr_classifier = SklearnClassifier(LogisticRegression())
        lr_classifier.train(train_set)
        self.classifier = lr_classifier
        # nltk.NaiveBayesClassifier.train(train_set) can be evaluated the
        # same way when a comparison with Naive Bayes is wanted.
        accuracy = nltk.classify.accuracy(lr_classifier, test_set)
        print('Logistic classifier Accuracy :  %s' % str(accuracy * 100))
        return accuracy