# OpenEParse/utils.py at master · Dostoyewski/OpenEParse · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Punctuation characters that convert_string removes from each space-separated token.
BADS = list(".,():;?")
# Fragments that convert_string removes from the whole text, in this order,
# before the text is split into tokens.
PBADS = ["&amp;", "amp;", ";", "\n"]


def fix_bads(page):
    """
    Replaces a fixed set of escaped HTML entities with their literal characters
    :param page: raw page text
    :return: page text with the listed entities replaced
    """
    # (escaped form, literal character); applied one after another in this order.
    substitutions = (
        ("&amp;#34;", '"'),
        ("&amp;lt;", "<"),
        ("&amp;gt;", ">"),
        ("&gt;", ">"),
        ("&lt;", "<"),
        ("&#34;", '"'),
    )
    for escaped, literal in substitutions:
        page = page.replace(escaped, literal)
    return page


def convert_string(text):
    """
    Decodes text in which characters are written as '#'-separated decimal codes
    :param text: raw text; PBADS fragments are removed from it before splitting on spaces
    :return: for every token, the decoded characters followed by a space and then,
             if non-empty, the part of the token before its first '#' followed by
             a space (so the result ends with a space)
    """
    cleaned = text
    for fragment in PBADS:
        cleaned = cleaned.replace(fragment, '')

    pieces = []
    for token in cleaned.split(sep=" "):
        for mark in BADS:
            token = token.replace(mark, '')
            token = token.replace("&#39", '"')
            # Drop '-' from tokens that contain '#1' (names)
            if '#1' in token:
                token = token.replace('-', '')
        # Everything after the first '#' is a list of decimal character codes
        head, *codes = token.split(sep="#")
        pieces.append(convert_to_utf(codes) + " ")
        if head != '':
            pieces.append(head + " ")
    return "".join(pieces)


def convert_to_utf(word):
    """
    Builds a string from a sequence of decimal character codes
    :param word: iterable of values accepted by int(), one per character
    :return: string made of chr(int(code)) for every code, in order
    """
    return "".join(chr(int(code)) for code in word)


def get_problem(page):
    """
    Collects questions and their answers from a prepared html page
    :param page: prepared html page
    :return: list of dicts with keys 'id' (position of the problem element on
             the page), 'problem' (decoded question text) and 'answers'
             (result of process_answers); problems that fail to parse are skipped
    """
    soup = BeautifulSoup(page, "html.parser")
    out_data = []
    for i, entry in enumerate(soup.find_all(class_="problem")):
        pr_txt = entry.find_all(class_="response-fieldset-legend")
        answers = entry.find_all(class_="response-label field-label label-inline")
        answers_c = entry.find_all(class_="response-label field-label label-inline choicegroup_correct")
        try:
            record = {'id': i,
                      'problem': convert_string(pr_txt[0].get_text()),
                      'answers': process_answers(answers + answers_c)}
        except Exception:
            # Best-effort parsing: a problem without a legend or with text that
            # cannot be decoded is skipped. Unlike a bare ``except:``, this
            # lets KeyboardInterrupt and SystemExit propagate.
            continue
        out_data.append(record)
    return out_data


def process_answers(answers):
    """
    Collects the text of answers whose input is checked
    :param answers: answer elements; each must provide find_all("input") and get_text()
    :return: list of answer strings, padded with "—" up to at least four items
    :raises IndexError: if an answer element contains no input
    """
    answer_arr = []
    for answer in answers:
        inp = answer.find_all("input")
        try:
            is_checked = inp[0]['checked'] == 'true'
        except KeyError:
            # The input has no 'checked' attribute, so it is not selected
            continue
        if not is_checked:
            continue

        raw = answer.get_text()
        # Only answers whose text contains a literal "&amp;" are kept
        if "&amp;" not in raw:
            continue

        try:
            txt = convert_string(raw)
        except (ValueError, OverflowError):
            # A piece after '#' was not a valid character code; keep the
            # undecoded part of the answer only
            txt = ""
        answer_arr.append(txt + " " +
                          raw.split(sep="&amp;")[0].replace("&#39", '"'))

    # Yes, this is a kludge: pad so that callers can always read four answers
    answer_arr.extend(["—"] * (4 - len(answer_arr)))
    return answer_arr


def authorizer(
        username,
        password,
        URL='https://sso.openedu.ru/login/',
        next_page='/oauth2/authorize%3Fstate%3DYpbWrm0u6VoE6nOvTi47PQLaC5CB5ZFJ%26redirect_uri%3Dhttps%3A//openedu.ru/complete/npoedsso/%26response_type%3Dcode%26client_id%3D808f52636759e3616f1a%26auth_entry%3Dlogin'):
    """
    Posts the login form and returns the session that was used
    :param username: uname
    :param password: pwd
    :param URL: login url
    :param next_page: redirect url sent as the form's 'next' field
    :return: requests session after the login POST (the response is not checked)
    """
    session = requests.session()
    # Read the 'csrftoken' cookie from the response to a GET of the login URL
    token = session.get(URL).cookies['csrftoken']
    form = {
        'username': username,
        'password': password,
        'csrfmiddlewaretoken': token,
        'next': next_page,
    }
    session.post(URL, data=form, headers={'Referer': URL})
    return session


def save_to_excel(data, filename="out.xlsx"):
    """
    Saving data to excel
    :param data: list of records from get_problem; each needs a 'problem' value
                 and an 'answers' sequence with at least four items
    :param filename: name of excel file
    :return:
    """
    columns = ['question', 'answer1', 'answer2', 'answer3', 'answer4']
    # Build all rows first and create the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = [
        {
            'question': rec['problem'],
            'answer1': rec['answers'][0],
            'answer2': rec['answers'][1],
            'answer3': rec['answers'][2],
            'answer4': rec['answers'][3],
        }
        for rec in data
    ]
    df = pd.DataFrame(rows, columns=columns)
    df.to_excel(filename)