Python-spider/getQuestion_topic at master · kozora/Python-spider · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#-*- coding: utf-8 -*-
__author__ = 'Wu_cf'
'''
获得一个话题下面的所有答案
1:话题url
2:http://www.zhihu.com/topic/19575436/questions?page=1
  http://www.zhihu.com/topic/' + link_id + '/questions?page=' + str(i)'''
from auth import islogin
from auth import Logging
import requests,cookielib,sys,urllib,urllib2,os,json,re,threading,Queue,time
from bs4 import BeautifulSoup
basePath = r'F:\Catch_IMGS'
requests = requests.Session()
requests.cookies = cookielib.LWPCookieJar('cookies')
try:
    requests.cookies.load(ignore_discard=True)
except:
    print u"尚未登录知乎=="
if islogin() != True:
    print u"请重新登录=="
#字符编码设置
reload(sys)
sys.setdefaultencoding('utf8')

class GetTopics:
    def __init__(self,url):
        global pagehtml
        self.url = url
        r = requests.get(self.url)
        #print r.url
        self.soup = BeautifulSoup(r.content,"lxml")
        pagehtml = r.content
        #print r.content
        print r.status_code
    def test(self):
        testtitle = self.soup.find("title").string
        print testtitle
        return self
    def get_foucustopic_num(self):
        num = int(self.soup.find("span",class_="follow-topics-count").string)
        print u"关注了",num,u"个话题"
        return num
    def getAll_topic_link_name(self):
        answers_num = self.get_foucustopic_num()
        topic_counter = 0
        getallquestion = GetQuestions()
        for i in xrange((answers_num - 1) / 20 + 1):
                if i == 0:
                   print "=======",i
                   topic_items = re.findall(r'<a class="topic-item-title-link" href="/topic/(.*?)">(.*?)</a>', pagehtml, re.S)

                   for topic in topic_items:
                        link = topic[0]
                        name = topic[1]
                        #topic_num = topic_num +1
                        print link,name,topic_counter
                        topic_counter = topic_counter + 1
                        getallquestion.get_all_top_questions(name,link)
                else:
                    print '=======',i
                    post_url = "http://www.zhihu.com/topic"
                    urls = "http://www.zhihu.com/"
                    r = requests.get(urls)
                    results = re.compile(r"\<input\stype=\"hidden\"\sname=\"_xsrf\"\svalue=\"(\S+)\"", re.DOTALL).findall(r.text)
                    _xsrf = results[0]

                    page_num = i + 1

                    data = {
                        '_xsrf': _xsrf,
                        'page': page_num
                    }
                    header = {
                        'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
                        'Host': "www.zhihu.com",
                        'Referer': self.url
                    }
                    r = requests.post(post_url, data=data, headers=header)

                    answer_list = r.json()["msg"]

                    for j in xrange(min(answers_num - i * 20, 20)):
                        topic_items = re.findall(r'<a class="topic-item-title-link" href="/topic/(.*?)">(.*?)</a>', answer_list[j], re.S)
                        for topic in topic_items:
                            link = topic[0]
                            name = topic[1]
                            print link,name,topic_counter
                            getallquestion.get_all_top_questions(name,link)
                            topic_counter = topic_counter + 1
        print topic_counter
        return self
class GetQuestions:
    '''1：根据话题来获得问题
       话题要不要存入进数据库或者其他什么方式
       3:http://www.zhihu.com/topic/19550523/top-answers?page=2
    '''
    def get_all_top_questions(self,name,link):
        #每隔话题爬10个页面吧
        question_num = 0
        default_url = 'http://www.zhihu.com/topic/' + link + '/top-answers'
        #question_url = 'http://www.zhihu.com/topic/' + link + '/top-answers?page=' + str(i)
        print u"话题为",name
        for i in xrange(10):

            if i == 0:
                print "=====页面",i
                try:
                    r = requests.get(default_url)
                except Exception as e:
                    pass
                question_items = re.findall(r'<h2><a class="question_link" target="_blank" href=(.*?)>(.*?)</a></h2>', r.content, re.S)
                for question in question_items:
                    question_num = question_num + 1
                    print question[1]," ",question_num

            else:
                print "=====页面",i
                changed_url = 'http://www.zhihu.com/topic/' + link + '/top-answers?page=' + str(i)
                try:
                    r = requests.get(changed_url)
                except Exception as e:
                    pass
                question_items = re.findall(r'<h2><a class="question_link" target="_blank" href=(.*?)>(.*?)</a></h2>', r.content, re.S)
                for question in question_items:
                    question_num = question_num + 1
                    print question[1]," ",question_num
def main():
    '''Run the crawl against the topic index page.

    Downloads the page, prints its title and the topic count, then walks
    every topic.  The count is printed again inside
    getAll_topic_link_name, which calls get_foucustopic_num itself.
    '''
    topics = GetTopics("http://www.zhihu.com/topic")
    topics.test()
    topics.get_foucustopic_num()
    topics.getAll_topic_link_name()


if __name__ == '__main__':
    main()