BaconNumberVisualization/Graph_Crawler.py at master · TSTEP99/BaconNumberVisualization · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import urllib;
import pandas as pd;
from urllib.request import urlopen;
from bs4 import BeautifulSoup;
import sqlite3;
import json;


class queque:
	"""First-in, first-out queue of pending items (the crawler stores URLs in it).

	The items live in a ``collections.deque`` so that removing the oldest
	element is O(1); deleting index 0 of a plain list shifts every remaining
	element on each pop.
	"""

	def __init__(self): #initializes the container the queue is implemented on
		# Local import keeps the class self-contained with respect to the
		# file's import block.
		from collections import deque
		self.array=deque()

	def pop(self): #removes and returns the oldest element
		"""Return the element that was pushed earliest.

		Raises IndexError if the queue is empty.
		"""
		return self.array.popleft()

	def push(self,item): #appends an element at the end
		self.array.append(item)

	def is_empty(self): #returns 1 if the queue is empty, 0 otherwise
		# Callers compare the result against the integer 1, so ints (not
		# bools) are returned on purpose.
		return 1 if len(self.array)==0 else 0


graph={};# the graph being built: "<name><id>" key -> list of neighbouring "<name><id>" strings
url_queque= queque(); #FIFO queue of the links that still have to be crawled
url_checker={};# URLs that were already queued, so no page is queued twice

# Resume from the graph saved by a previous run. On a first run graph.json
# does not exist yet; start from the empty graph instead of crashing.
try:
	with open('graph.json','r') as json_data:
		graph=json.load(json_data);
except FileNotFoundError:
	pass

#with open("url_checker.json","r") as json_data: #gets previous URLs that were checked
	#url_checker=json.load(json_data);


def is_integer(number):
	"""Return True if ``int(number)`` succeeds, otherwise False.

	Anything ``int()`` accepts counts as an integer here, so strings with
	surrounding whitespace or a sign are accepted as well. Values ``int()``
	cannot handle at all (such as ``None``) yield False instead of raising.
	"""
	try:
		int(number) #only the success or failure of the conversion matters
	except (TypeError, ValueError):
		return False
	return True

def find_name(soup,type):
	"""Extract the display name of an actor or a movie from a parsed page.

	soup -- parsed page object; ``find_all`` and ``select`` are called on it
	        (the callers in this file pass a BeautifulSoup object).
	type -- "A" for an actor page, "M" for a movie page.

	Returns the name. For a movie, a trailing "(YYYY)" suffix is removed
	together with the whitespace in front of it. Returns None for any other
	``type`` value.

	Raises IndexError when the page has no matching element.
	"""
	if type=="A": #an actor page is being passed
		# the name is taken from the first <span class="itemprop"> of the page
		spans=soup.find_all("span",class_="itemprop",limit=1)
		return spans[0].string

	if type=="M": #a movie page is being passed
		name=soup.select('h1')[0].text.strip() #text of the first h1 tag
		ends_with_year=(len(name)>=6 and name[-1]==')' and name[-6]=='('
			and is_integer(name[-5:-1]))
		if ends_with_year:
			# Remove the 6-character "(YYYY)" suffix and any separating
			# whitespace. Cutting a fixed 7 characters would eat a title
			# character when no space precedes the parenthesis, and would
			# mangle a heading that consists of the suffix only.
			title=name[:-6].rstrip()
			if title: #keep the heading unchanged if nothing else is left
				name=title
		return name

	return None

def find_movies(soup):
	"""Collect the movies listed on an actor page and queue the unseen ones.

	soup -- parsed actor page; ``select`` is called on it.

	For the first link with an href inside every ``div.filmo-row`` element
	that has a plain string as link text, the edge label "<link text><id>" is
	recorded, where <id> is the 7 characters following "/title/tt" in the
	href. Movie URLs not yet present in ``url_checker`` are pushed onto
	``url_queque`` and marked as seen.

	Side effect: when at least one new URL was queued, ``url_checker`` is
	written to url_checker.json once, after the whole page has been scanned
	(rewriting the full file for every single new link is quadratic I/O).

	Returns the list of edge labels.
	"""
	edges=[] #list of edges to be used for the graph when returned
	id_start=len("/title/tt")
	queued_new=False

	for row in soup.select("div.filmo-row"): #the filmography rows
		for anchor in row.find_all("a",href=True,limit=1): #first link of the row
			if anchor.string is None:
				continue #link text is not a single plain string
			href=anchor["href"]
			# NOTE(review): the id is cut to a fixed 7 characters; longer ids
			# would be truncated - confirm against the ids actually crawled.
			movie_id=href[id_start:id_start+7]
			edges.append(anchor.string+movie_id)
			url="https://www.imdb.com"+href
			if url not in url_checker: #never queue a page twice
				url_queque.push(url)
				url_checker[url]=1
				queued_new=True

	if queued_new:
		# mode "w" already truncates the file, so no explicit truncate()
		with open("url_checker.json","w") as json_data:
			json.dump(url_checker,json_data)

	return edges


def find_actors(soup):
	"""Collect the actors listed on a movie page and queue the unseen ones.

	soup -- parsed movie page; ``find_all`` is called on it.

	For the first link with an href inside every ``td`` returned by
	``find_all("td",class_=None)`` that has a plain string as link text, the
	edge label "<link text without its last character><id>" is recorded,
	where <id> is the 7 characters following "/name/nm" in the href. Actor
	URLs not yet present in ``url_checker`` are pushed onto ``url_queque``
	and marked as seen.

	Side effect: when at least one new URL was queued, ``url_checker`` is
	written to url_checker.json once, after the whole page has been scanned
	(rewriting the full file for every single new link is quadratic I/O).

	Returns the list of edge labels.
	"""
	edges=[] #list of edges to be used for the graph when returned
	id_start=len("/name/nm")
	queued_new=False

	# NOTE(review): class_=None presumably restricts the search to cells
	# without a class attribute - confirm against the BeautifulSoup docs.
	for cell in soup.find_all("td",class_=None):
		for anchor in cell.find_all("a",href=True,limit=1): #first link inside the td
			if anchor.string is None:
				continue #link text is not a single plain string
			href=anchor["href"]
			actor_id=href[id_start:id_start+7] #the number from the href attribute
			# the last character of the link text is dropped
			# (presumably a trailing newline - TODO confirm)
			edges.append(anchor.string[:-1]+actor_id)
			url="https://www.imdb.com"+href
			if url not in url_checker: #never queue a page twice
				url_queque.push(url)
				url_checker[url]=1
				queued_new=True

	if queued_new:
		# mode "w" already truncates the file, so no explicit truncate()
		with open("url_checker.json","w") as json_data:
			json.dump(url_checker,json_data)

	return edges


def spider():
	"""Crawl every queued URL in FIFO order and record each page in ``graph``.

	Each URL popped from ``url_queque`` is downloaded and parsed. Actor pages
	(URL contains "https://www.imdb.com/name/") are stored with their movies
	as edges, movie pages (URL contains "https://www.imdb.com/title/") with
	their actors. The node key is the page name followed by the 7 characters
	after the "nm"/"tt" prefix of the URL. After every recorded page the
	whole graph is written to graph.json. URLs of any other form are dropped.

	A page that cannot be downloaded or parsed is skipped. Only ``Exception``
	is caught for that, so Ctrl-C (KeyboardInterrupt) and SystemExit still
	stop the crawl instead of being swallowed.
	"""
	while url_queque.is_empty()!=1: #runs until no queued URL is left
		url=url_queque.pop()

		try:
			# the with-block closes the HTTP response once it has been read
			with urlopen(url) as response:
				html=response.read()
			soup=BeautifulSoup(html,'html.parser')
		except Exception:
			continue #best effort: an unreachable or unreadable page is skipped

		if "https://www.imdb.com/name/" in url:
			page_type="A"
			id_prefix="https://www.imdb.com/name/nm"
			find_edges=find_movies
		elif "https://www.imdb.com/title/" in url:
			page_type="M"
			id_prefix="https://www.imdb.com/title/tt"
			find_edges=find_actors
		else:
			continue #neither an actor page nor a movie page

		name=find_name(soup,page_type)
		name=name+url[len(id_prefix):len(id_prefix)+7] #attaches the id to the name
		print(name)
		graph[name]=find_edges(soup)

		# checkpoint after every page; mode "w" already truncates the file
		with open("graph.json","w") as json_data:
			json.dump(graph,json_data)


# Script entry: seed the crawl with one actor page and start the spider.
# (presumably Kevin Bacon's page, given the project name - confirm)
url="https://www.imdb.com/name/nm0000102";
url_queque.push(url);
url_checker[url]=1; # mark the seed as seen so it is never queued a second time
# NOTE(review): the page fetched and parsed here is not passed to spider();
# spider() pops the seed URL from the queue and downloads it again itself.
# As written, a failure here aborts the script before the crawl starts.
response=urlopen(url);
html=response.read();
soup=BeautifulSoup(html,'html.parser');
spider();