-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexecPy
More file actions
executable file
·88 lines (63 loc) · 2.35 KB
/
execPy
File metadata and controls
executable file
·88 lines (63 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python
import argparse
import html2text # for converting to readable text
import urllib2 #for grabbing html from page
class TagExtractor:
    """Pull the table-of-contents and individual section HTML out of a raw
    Wikipedia page by naive substring matching (no HTML parser is used)."""

    def __init__(self, html):
        # Coerce to str so .find() works on whatever the fetch returned.
        self.html = str(html)
        self.toc = None   # cached result of the last extractToc() call
        self.tags = []    # reserved for future tag collection (currently unused)

    def extractToc(self):
        """Return the HTML slice from the TOC <div> up to (not including) the
        first following <p>, caching it on self.toc.

        Returns "" when the page has no TOC marker (previously an unhandled
        find() == -1 produced the bogus slice html[-1:j]).
        """
        start_marker = '<div id="toc" class="toc">'
        i = self.html.find(start_marker)
        if i == -1:
            # Page has no table of contents.
            self.toc = ""
            return ""
        j = self.html.find("<p>", i)
        # No trailing <p>: take everything to the end of the document.
        self.toc = self.html[i:] if j == -1 else self.html[i:j]
        return self.toc

    def extractSection(self, section_header):
        """Return the HTML for the <h2> section whose mw-headline id equals
        section_header (underscored form, e.g. "Early_life"), up to the next
        <h2>. Returns "" when the section is not found.
        """
        display = section_header.replace("_", " ")
        start_marker = ('<h2><span class="mw-headline" id="%s">%s</span></h2>'
                        % (section_header, display))
        i = self.html.find(start_marker)
        if i == -1:
            # Section heading not present on this page.
            return ""
        j = self.html.find("<h2>", i + 1)
        # Last section on the page has no following <h2>.
        return self.html[i:] if j == -1 else self.html[i:j]
# ---- command-line interface ------------------------------------------------
parser = argparse.ArgumentParser(
    description="Fetch a Wikipedia page and print its table of contents "
                "or a single section as plain text.")
parser.add_argument("wikipage", help="title of the Wikipedia page to fetch")
parser.add_argument("--ignorelinks", help="ignores the links",
                    action="store_true")
parser.add_argument("--fullpage",
                    help="prints full page instead of just TOC",
                    action="store_true")
parser.add_argument("--section", metavar="", dest="section", default="",
                    help="input a section you'd like to retrieve. If nothing inputted, will return the table of contents")
args = parser.parse_args()

# Wikipedia URLs use underscores in place of spaces.
page = args.wikipage.replace(" ", "_")
section = args.section.replace(" ", "_")
language = "en"
query_string = "https://" + language + ".wikipedia.org/wiki/" + page
print(query_string)

import time
current_time = lambda: int(round(time.time() * 1000))  # wall clock in ms

# Fetch the whole page at once and time the round trip.
pre_fetch = current_time()
page_response = urllib2.urlopen(query_string)
page_html = page_response.read()
page_response.close()  # don't leak the connection
post_fetch = current_time()
print("time to fetch: " + str(post_fetch - pre_fetch) + "ms")

extract = TagExtractor(page_html)
toc_html = extract.extractToc()
section_html = extract.extractSection(section)

conv = html2text.HTML2Text()
# BUG FIX: --ignorelinks was parsed but its handling was commented out,
# so the flag silently did nothing.
if args.ignorelinks:
    conv.ignore_links = True

page_text = conv.handle(page_html.decode('utf8'))
toc_text = conv.handle(toc_html.decode('utf8'))
section_text = conv.handle(section_html.decode('utf8'))

# Dump the raw HTML for debugging/inspection.
with open("html.txt", "w") as f:
    f.write(page_html)

# BUG FIX: --fullpage was parsed and page_text computed, but the flag
# never affected the output. Empty --section means "show the TOC".
if args.fullpage:
    print(page_text)
elif not section:
    print(toc_text)
else:
    print(section_text)