-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathWordsToNumbers.py
More file actions
124 lines (86 loc) · 3.5 KB
/
WordsToNumbers.py
File metadata and controls
124 lines (86 loc) · 3.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import pickle
import re
import string
from math import log
# Build a cost dictionary, assuming Zipf's law and cost = -math.log(probability).
# The position of a word in the file is used as its frequency rank
# (presumably the file is sorted most-frequent first -- confirm against the data).
# The file is read inside a `with` block so the handle is closed deterministically.
with open("words_by_frequency.txt") as _word_file:
    words = _word_file.read().split()
# Disabled alternative source of the word list, kept for reference:
# words = pickle.load(open("words", 'rb'))

# log(len(words)) does not depend on the word, so compute it once.
_log_vocabulary_size = log(len(words))
# Maps each word to its cost; a later duplicate in the file overrides an earlier one.
wordcost = {k: log((i + 1) * _log_vocabulary_size) for i, k in enumerate(words)}
# Length of the longest known word; bounds the look-back window in infer_spaces.
maxword = max(len(x) for x in words)
def camel_case_split(identifier):
    """Break an identifier into its camel-case components.

    A boundary is placed between a lowercase letter and a following uppercase
    letter, and before the last capital of an uppercase run that is followed
    by a lowercase letter (so "HTTPResponse" yields "HTTP" and "Response").

    :param identifier: the string to split.
    :return: list of the pieces in their original order and case; an empty
        string yields an empty list.
    """
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    pieces = []
    for found in re.finditer(boundary_pattern, identifier):
        pieces.append(found.group(0))
    return pieces
def infer_spaces(s):
    """Use dynamic programming to infer the location of spaces in a string
    without spaces.

    Relies on the module-level ``wordcost`` (word -> cost) and ``maxword``
    (length of the longest known word).  Substrings missing from ``wordcost``
    get an infinite cost, so unknown text ends up split into single characters.

    :param s: the string to segment.
    :return: a one-shot reverse iterator over the segments, in left-to-right
        order.  An empty input yields an empty iterator.
    """
    unknown_cost = float("inf")

    # Find the best match for the first `end` characters, assuming cost has
    # been built for the first `end - 1` characters.
    # Returns a pair (match_cost, match_length).
    def best_match(end):
        window_start = max(0, end - maxword)
        candidates = enumerate(reversed(cost[window_start:end]))
        return min(
            (prev_cost + wordcost.get(s[end - k - 1:end], unknown_cost), k + 1)
            for k, prev_cost in candidates
        )

    # Build the cost array, remembering the winning word length at each
    # position so the backtrack does not have to re-run best_match (and does
    # not need an assert, which would be stripped under -O, to cross-check it).
    cost = [0]
    lengths = [0]
    for end in range(1, len(s) + 1):
        match_cost, match_length = best_match(end)
        cost.append(match_cost)
        lengths.append(match_length)

    # Backtrack from the end of the string to recover the minimal-cost split.
    out = []
    i = len(s)
    while i > 0:
        k = lengths[i]
        out.append(str(s[i - k:i]))
        i -= k
    # Segments were collected right-to-left; hand them back left-to-right.
    return reversed(out)
def sentence_to_word_array(sentence="", remove_punctuation=True, to_lowercase=True, should_infer_spaces=True, should_camel_case_split=True):
    """Turn a sentence into a list of unique, lowercased tokens.

    For every raw word of the sentence the function collects, in this order:
    its camel-case pieces (optional), the words inferred inside it by
    ``infer_spaces`` (optional), and finally the word itself.  Pieces shorter
    than 3 characters and whole words shorter than 2 characters are dropped.
    Each token appears once, at the position where it was first seen.

    :param sentence: the text to tokenize.
    :param remove_punctuation: when true, split on runs of non-word characters
        and strip remaining ``string.punctuation`` characters (in practice the
        underscore) from each word; when false, split on whitespace only.
    :param to_lowercase: accepted for backward compatibility; tokens are
        always lowercased regardless of its value.
    :param should_infer_spaces: also add the words ``infer_spaces`` finds
        inside each (lowercased) word.
    :param should_camel_case_split: also add the camel-case pieces of each word.
    :return: list of tokens in first-seen order, without duplicates.
    """
    if remove_punctuation:
        # Raw string: the pattern contains regex escapes (\W, \-) that are
        # invalid escape sequences in a normal string literal.
        separators = r"[,\W\x03\x07\x0b\n\t\r.()' \-!?:/#@*^%&$<>;\"`~{}-]+"
        raw_words = [piece for piece in re.split(separators, sentence) if piece]
        # Deletion table; passing string.punctuation straight to translate()
        # does not delete anything in Python 3.
        punctuation_table = str.maketrans("", "", string.punctuation)
    else:
        # Without punctuation removal there is still a sentence to split.
        raw_words = sentence.split()
        punctuation_table = None

    return_array = []
    seen = set()  # O(1) duplicate check alongside the ordered result list

    def add_token(token, min_length):
        token = token.lower()
        if len(token) >= min_length and token not in seen:
            seen.add(token)
            return_array.append(token)

    for word in raw_words:
        if should_camel_case_split:
            # Split on the original casing; lowercasing first would erase
            # every camel-case boundary.
            for piece in camel_case_split(word):
                add_token(piece, 3)
        if punctuation_table is not None:
            word = word.translate(punctuation_table)
        if should_infer_spaces:
            for piece in infer_spaces(word.lower()):
                add_token(piece, 3)
        # Always consider the whole word itself (not the last inner piece).
        add_token(word, 2)
    return return_array
def words_to_numbers(input_matrix=[[]], file_to_write=""):
    """Replace every item of a 2-D collection by an integer ID.

    IDs start at 1 and are handed out in first-seen order (row by row, left
    to right); repeated items get the ID assigned at their first occurrence.

    :param input_matrix: iterable of rows, each an iterable of hashable items.
        The default is never mutated.
    :param file_to_write: if not the empty string, path where the item -> ID
        dictionary is pickled.
    :return: tuple ``(return_matrix, words)`` where ``return_matrix`` mirrors
        the shape of ``input_matrix`` with IDs, and ``words`` maps item -> ID.
    """
    words = {}
    return_matrix = []
    for row in input_matrix:
        # len(words) + 1 is the next unused ID because IDs are dense from 1.
        return_matrix.append([words.setdefault(col, len(words) + 1) for col in row])
    if file_to_write != "":
        # Context manager so the pickle is flushed and the handle closed.
        with open(file_to_write, 'wb') as out_file:
            pickle.dump(words, out_file)
    return return_matrix, words
def words_to_numbers_from_old_words_dict(input_matrix=[[]], words=dict(), unk_integer=-1, file_to_read=""):
    """Encode a 2-D collection with an existing item -> ID dictionary.

    :param input_matrix: iterable of rows, each an iterable of items.
    :param words: mapping item -> ID; ignored when ``file_to_read`` is given.
        The default is never mutated.
    :param unk_integer: ID used for items missing from the mapping.
    :param file_to_read: if not the empty string, path of a pickled mapping
        that replaces ``words``.
    :return: list of lists with the same shape as ``input_matrix``, holding
        the ID of each item or ``unk_integer`` when the item is unknown.
    """
    if file_to_read != "":
        # NOTE(security): unpickling can execute arbitrary code; only load
        # dictionary files that come from a trusted source.
        with open(file_to_read, 'rb') as in_file:
            words = pickle.load(in_file)
    return [
        [words[col] if col in words else unk_integer for col in row]
        for row in input_matrix
    ]