-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfrequency.py
More file actions
executable file
·77 lines (43 loc) · 1.34 KB
/
frequency.py
File metadata and controls
executable file
·77 lines (43 loc) · 1.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
__author__ = 'cs'
import json
import sys
from collections import Counter
def load_and_get_tweetfile(tweet_file):
    """Parse a file of newline-delimited JSON documents into a list.

    Args:
        tweet_file: An open file object. It is iterated line by line and
            used as a context manager, so it is closed on return.

    Returns:
        A list holding one decoded JSON value per non-blank line, in file
        order.

    Raises:
        ValueError: If a non-blank line is not valid JSON.
    """
    with tweet_file as f:
        # A blank or whitespace-only line holds no JSON document; skipping it
        # keeps one empty line from aborting the whole load.
        return [json.loads(line_str) for line_str in f if line_str.strip()]
def preprocess_tweet(tweet):
    """Split a tweet's text into lowercase, punctuation-free ASCII terms.

    Args:
        tweet: A mapping decoded from JSON. Only its "text" entry is read.

    Returns:
        A list of terms as native ``str`` objects; empty when the mapping has
        no "text" key or the text contains no terms.
    """
    if "text" not in tweet:
        return []
    # Work on ASCII bytes so the same calls behave identically on Python 2
    # and Python 3. Non-ASCII characters become "?", which is itself in the
    # delete set below, so they end up removed.
    punctuation = b",./;'?&-#!@"
    ascii_text = tweet["text"].encode("ascii", "replace")
    cleaned = ascii_text.lower().translate(None, punctuation)
    # split() with no argument already discards surrounding whitespace, so no
    # per-term strip is needed. str(...) yields the native string type on
    # both interpreter lines.
    return [str(term.decode("ascii")) for term in cleaned.split()]
def get_preprocessed_tweet_list(tweets):
    """Return the term list produced by preprocess_tweet for every tweet.

    Args:
        tweets: An iterable of tweet objects.

    Returns:
        A list with one term list per input tweet, in the same order.
    """
    return [preprocess_tweet(entry) for entry in tweets]
def get_term_count_dict(tweets):
    """Count how often each term occurs across all tweets.

    Args:
        tweets: An iterable of term sequences, one sequence per tweet.

    Returns:
        A plain dict mapping each term to its total number of occurrences.
    """
    counts = Counter()
    for tweet in tweets:
        counts.update(tweet)
    # Return a plain dict so that looking up an unseen term still raises
    # KeyError rather than silently yielding 0.
    return dict(counts)
def main():
    """Print the relative frequency of every term in a tweet file.

    The file path is taken from ``sys.argv[1]``. One line is written to
    standard output per distinct term, in the form ``<term> <frequency>``,
    where frequency is the term's count divided by the total number of term
    occurrences in the file.

    Raises:
        SystemExit: If no file path was given on the command line.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: {0} <tweet_file>".format(sys.argv[0]))
    # load_and_get_tweetfile uses the handle as a context manager, so the
    # file opened here is closed once loading finishes.
    tweets = load_and_get_tweetfile(open(sys.argv[1]))
    tweet_text_list = get_preprocessed_tweet_list(tweets)
    terms = get_term_count_dict(tweet_text_list)
    overall_term_count = sum(terms.values())
    for term, count in terms.items():
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(term + " " + str(float(count) / overall_term_count))
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()