-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathutils.py
More file actions
139 lines (124 loc) · 4.18 KB
/
utils.py
File metadata and controls
139 lines (124 loc) · 4.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Punctuation removed from every space-separated token by convert_string.
BADS = [
    ".",
    ",",
    "(",
    ")",
    ":",
    ";",
    "?",
]
# Fragments removed from the whole text by convert_string before it is split
# into tokens.
PBADS = [
    "&",
    "amp;",
    ";",
    "\n",
]
def fix_bads(page):
    """
    Prepares escaped html for parsing by restoring quotes and angle brackets
    :param page: raw data (str) where markup characters are written as the
                 ``&quot;``, ``&lt;`` and ``&gt;`` entities
    :return: fixed html with those entities replaced by ``"``, ``<`` and ``>``
    """
    # Only these three entities are decoded here. "&", "amp;" and the numeric
    # "#NNNN" codes are deliberately left alone: convert_string strips and
    # decodes them itself.
    # NOTE(review): the entity spellings were reconstructed from a garbled
    # copy of this function - confirm the list against the data source.
    replacements = (
        ("&quot;", '"'),
        ("&lt;", "<"),
        ("&gt;", ">"),
    )
    for entity, char in replacements:
        page = page.replace(entity, char)
    return page
def convert_string(text):
    """
    Decodes '#<number>' character codes embedded in a string
    :param text: string whose tokens may contain numeric codes prefixed with '#'
    :return: decoded tokens, each followed by a single space; for a token with
             both a plain prefix and codes, the decoded part comes first
    """
    # Drop entity leftovers and newlines from the whole text first.
    for junk in PBADS:
        text = text.replace(junk, '')

    pieces = []
    for token in text.split(sep=" "):
        for mark in BADS:
            token = token.replace(mark, '')
        token = token.replace("'", '"')
        # Hyphens are removed only from tokens that carry '#1...' codes
        if '#1' in token:
            token = token.replace('-', '')
        # Everything before the first '#' is plain text, the rest are codes.
        head, *codes = token.split(sep="#")
        pieces.append(convert_to_utf(codes) + " ")
        if head != '':
            pieces.append(head + " ")
    return "".join(pieces)
def convert_to_utf(word):
    """
    Builds a string out of a sequence of numeric character codes
    :param word: iterable of values accepted by int(), e.g. ['1090', '1077']
    :return: string of chr(int(item)) for every item; '' for an empty sequence
    """
    return "".join(chr(int(code)) for code in word)
def get_problem(page):
    """
    Get questions and their answers from a page
    :param page: prepared html page
    :return: list of dicts with keys 'id', 'problem' and 'answers'; 'id' is the
             position of the problem element on the page, so ids have gaps
             where an element was skipped
    """
    soup = BeautifulSoup(page, "html.parser")
    out_data = []
    for i, entry in enumerate(soup.find_all(class_="problem")):
        pr_txt = entry.find_all(class_="response-fieldset-legend")
        answers = entry.find_all(class_="response-label field-label label-inline")
        answers_c = entry.find_all(class_="response-label field-label label-inline choicegroup_correct")
        try:
            out_data.append({'id': i,
                             'problem': convert_string(pr_txt[0].get_text()),
                             'answers': process_answers(answers + answers_c)})
        except Exception:
            # Best effort: a problem element that cannot be processed (e.g. no
            # legend found, undecodable text) is skipped. Exception instead of
            # a bare except keeps KeyboardInterrupt/SystemExit propagating.
            continue
    return out_data
def process_answers(answers):
    """
    Collects the text of checked answers
    :param answers: answer elements; each one must provide find_all("input")
                    and get_text()
    :return: list of at least four strings; missing entries are filled with "—"
    :raises IndexError: if an answer element contains no input element
    """
    answer_arr = []
    for answer in answers:
        inp = answer.find_all("input")
        try:
            checked = inp[0]['checked'] == 'true'
        except KeyError:
            # The input has no 'checked' attribute at all.
            continue
        if not checked:
            continue
        text = answer.get_text()
        # Only answers whose text still contains '&' are collected.
        if "&" not in text:
            continue
        try:
            txt = convert_string(text)
        except (ValueError, OverflowError):
            # Undecodable character codes: keep only the plain prefix below.
            txt = ""
        answer_arr.append(txt + " " + text.split(sep="&")[0].replace("'", '"'))
    # HACK: pad to four entries, save_to_excel reads answers[0]..answers[3].
    while len(answer_arr) < 4:
        answer_arr.append("—")
    return answer_arr
def authorizer(username, password, URL='https://sso.openedu.ru/login/', next_page='/oauth2/authorize%3Fstate%3DYpbWrm0u6VoE6nOvTi47PQLaC5CB5ZFJ%26redirect_uri%3Dhttps%3A//openedu.ru/complete/npoedsso/%26response_type%3Dcode%26client_id%3D808f52636759e3616f1a%26auth_entry%3Dlogin'):
    """
    Posts the login form and returns the session that was used for it
    :param username: uname
    :param password: pwd
    :param URL: login url; fetched first to obtain the 'csrftoken' cookie
    :param next_page: redirect url, sent as the 'next' form field
    :return: requests.Session holding the cookies set during both requests
    :raises KeyError: if the response to the first request carries no
                      'csrftoken' cookie
    """
    client = requests.Session()
    csrf = client.get(URL).cookies['csrftoken']
    login_data = dict(username=username, password=password, csrfmiddlewaretoken=csrf, next=next_page)
    # The response is not inspected, so a failed login is not detected here.
    client.post(URL, data=login_data, headers=dict(Referer=URL))
    return client
def save_to_excel(data, filename="out.xlsx"):
    """
    Saving data to excel
    :param data: iterable of dicts with a 'problem' value and an 'answers'
                 sequence of at least four items (as built by get_problem)
    :param filename: name of excel file
    :return: None
    :raises IndexError: if a record has fewer than four answers
    """
    columns = ['question', 'answer1', 'answer2', 'answer3', 'answer4']
    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = [
        {'question': rec['problem'],
         'answer1': rec['answers'][0],
         'answer2': rec['answers'][1],
         'answer3': rec['answers'][2],
         'answer4': rec['answers'][3]}
        for rec in data
    ]
    df = pd.DataFrame(rows, columns=columns)
    df.to_excel(filename)