forked from Jackevansevo/Udacity-Python-Code
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwebcrawler.py
More file actions
113 lines (99 loc) · 4.14 KB
/
webcrawler.py
File metadata and controls
113 lines (99 loc) · 4.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# The web crawler we built at the end of Unit 3 has some serious
# flaws if we were going to use it in a real crawler. One
# problem is if we start with a good seed page, it might
# run for an extremely long time (even forever, since the
# number of URLS on the web is not actually finite). This
# question and the following one explore two different ways
# to limit the pages that it can crawl.
# Modify the crawl_web procedure to take a second parameter,
# max_pages, that limits the number of pages to crawl.
# Your procedure should terminate the crawl after
# max_pages different pages have been crawled, or when
# there are no more pages to crawl.
# The following definition of get_page provides an interface
# to the website found at http://www.udacity.com/cs101x/index.html
# The function output order does not affect grading.
def get_page(url):
try:
if url == "http://www.udacity.com/cs101x/index.html":
return ('<html> <body> This is a test page for learning to crawl! '
'<p> It is a good idea to '
'<a href="http://www.udacity.com/cs101x/crawling.html">learn to '
'crawl</a> before you try to '
'<a href="http://www.udacity.com/cs101x/walking.html">walk</a> '
'or <a href="http://www.udacity.com/cs101x/flying.html">fly</a>. '
'</p> </body> </html> ')
elif url == "http://www.udacity.com/cs101x/crawling.html":
return ('<html> <body> I have not learned to crawl yet, but I '
'am quite good at '
'<a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>.'
'</body> </html>')
elif url == "http://www.udacity.com/cs101x/walking.html":
return ('<html> <body> I cant get enough '
'<a href="http://www.udacity.com/cs101x/index.html">crawling</a>! '
'</body> </html>')
elif url == "http://www.udacity.com/cs101x/flying.html":
return ('<html> <body> The magic words are Squeamish Ossifrage! '
'</body> </html>')
except:
return ""
return ""
def get_next_target(page):
start_link = page.find('<a href=')
if start_link == -1:
return None, 0
start_quote = page.find('"', start_link)
end_quote = page.find('"', start_quote + 1)
url = page[start_quote + 1:end_quote]
return url, end_quote
def union(p,q):
for e in q:
if e not in p:
p.append(e)
def get_all_links(page):
links = []
while True:
url,endpos = get_next_target(page)
if url:
links.append(url)
page = page[endpos:]
else:
break
return links
def crawl_web(seed,max_pages):
tocrawl = [seed]
crawled = []
next_deph = []
depth = 0
while tocrawl and depth <= max_depth:
page = tocrawl.pop()
if page not in crawled:
union(next_depth, get_all_links(get_page(page)))
crawled.append(page)
if not tocrawl:
tocrawl, next_depth = next_depth, []
depth = depth + 1
return crawled
print crawl_web("http://www.udacity.com/cs101x/index.html",0)
#>>> ['http://www.udacity.com/cs101x/index.html']
print crawl_web("http://www.udacity.com/cs101x/index.html",1)
#>>> ['http://www.udacity.com/cs101x/index.html',
#>>> 'http://www.udacity.com/cs101x/flying.html',
#>>> 'http://www.udacity.com/cs101x/walking.html',
#>>> 'http://www.udacity.com/cs101x/crawling.html']
print crawl_web("http://www.udacity.com/cs101x/index.html",50)
#>>> ['http://www.udacity.com/cs101x/index.html',
#>>> 'http://www.udacity.com/cs101x/flying.html',
#>>> 'http://www.udacity.com/cs101x/walking.html',
#>>> 'http://www.udacity.com/cs101x/crawling.html',
#>>> 'http://www.udacity.com/cs101x/kicking.html']
print crawl_web("http://top.contributors/forbiddenvoid.html",2)
#>>> ['http://top.contributors/forbiddenvoid.html',
#>>> 'http://top.contributors/graemeblake.html',
#>>> 'http://top.contributors/angel.html',
#>>> 'http://top.contributors/dreyescat.html',
#>>> 'http://top.contributors/johang.html',
#>>> 'http://top.contributors/charlzz.html']
print crawl_web("A1",3)
#>>> ['A1', 'C1', 'B1', 'E1', 'D1', 'F1']
# (May be in any order)