-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexecPy
More file actions
executable file
·88 lines (63 loc) · 2.35 KB
/
execPy
File metadata and controls
executable file
·88 lines (63 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python
import argparse
import html2text # for converting to readable text
import urllib2 #for grabbing html from page
class TagExtractor:
    """Pull the table-of-contents and individual section HTML out of a raw
    Wikipedia page by naive substring matching (no HTML parser is used)."""

    def __init__(self, html):
        # Coerce to str so .find() works on whatever the fetch returned.
        self.html = str(html)
        self.toc = None   # cached result of the last extractToc() call
        self.tags = []    # reserved for future tag collection (currently unused)

    def extractToc(self):
        """Return the HTML slice from the TOC <div> up to (not including) the
        first following <p>, caching it on self.toc.

        Returns "" when the page has no TOC marker (previously an unhandled
        find() == -1 produced the bogus slice html[-1:j]).
        """
        start_marker = '<div id="toc" class="toc">'
        i = self.html.find(start_marker)
        if i == -1:
            # Page has no table of contents.
            self.toc = ""
            return ""
        j = self.html.find("<p>", i)
        # No trailing <p>: take everything to the end of the document.
        self.toc = self.html[i:] if j == -1 else self.html[i:j]
        return self.toc

    def extractSection(self, section_header):
        """Return the HTML for the <h2> section whose mw-headline id equals
        section_header (underscored form, e.g. "Early_life"), up to the next
        <h2>. Returns "" when the section is not found.
        """
        display = section_header.replace("_", " ")
        start_marker = ('<h2><span class="mw-headline" id="%s">%s</span></h2>'
                        % (section_header, display))
        i = self.html.find(start_marker)
        if i == -1:
            # Section heading not present on this page.
            return ""
        j = self.html.find("<h2>", i + 1)
        # Last section on the page has no following <h2>.
        return self.html[i:] if j == -1 else self.html[i:j]
# ---- command-line interface ------------------------------------------------
parser = argparse.ArgumentParser(
    description="Fetch a Wikipedia page and print its table of contents "
                "or a single section as plain text.")
parser.add_argument("wikipage", help="title of the Wikipedia page to fetch")
parser.add_argument("--ignorelinks", help="ignores the links",
                    action="store_true")
parser.add_argument("--fullpage",
                    help="prints full page instead of just TOC",
                    action="store_true")
parser.add_argument("--section", metavar="", dest="section", default="",
                    help="input a section you'd like to retrieve. If nothing inputted, will return the table of contents")
args = parser.parse_args()

# Wikipedia URLs use underscores in place of spaces.
page = args.wikipage.replace(" ", "_")
section = args.section.replace(" ", "_")
language = "en"
query_string = "https://" + language + ".wikipedia.org/wiki/" + page
print(query_string)

import time
current_time = lambda: int(round(time.time() * 1000))  # wall clock in ms

# Fetch the whole page at once and time the round trip.
pre_fetch = current_time()
page_response = urllib2.urlopen(query_string)
page_html = page_response.read()
page_response.close()  # don't leak the connection
post_fetch = current_time()
print("time to fetch: " + str(post_fetch - pre_fetch) + "ms")

extract = TagExtractor(page_html)
toc_html = extract.extractToc()
section_html = extract.extractSection(section)

conv = html2text.HTML2Text()
# BUG FIX: --ignorelinks was parsed but its handling was commented out,
# so the flag silently did nothing.
if args.ignorelinks:
    conv.ignore_links = True

page_text = conv.handle(page_html.decode('utf8'))
toc_text = conv.handle(toc_html.decode('utf8'))
section_text = conv.handle(section_html.decode('utf8'))

# Dump the raw HTML for debugging/inspection.
with open("html.txt", "w") as f:
    f.write(page_html)

# BUG FIX: --fullpage was parsed and page_text computed, but the flag
# never affected the output. Empty --section means "show the TOC".
if args.fullpage:
    print(page_text)
elif not section:
    print(toc_text)
else:
    print(section_text)