forked from hegugugugu/zhihuSpider
-
Notifications
You must be signed in to change notification settings - Fork 0
/
zhihuSpider.py
98 lines (81 loc) · 3.4 KB
/
zhihuSpider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 14:49:04 2022
@author: maotuo
"""
import pandas as pd
import time
import requests
import execjs
import hashlib
import urllib
import urllib.parse
from bs4 import BeautifulSoup
# Module-level accumulators filled while scraping; entries at the same index
# across the per-answer lists describe the same answer.
userId: list = []       # author id of each answer
userTime: list = []     # creation time of each answer, as a formatted string
userName: list = []     # author name of each answer
userContent: list = []  # plain-text body of each answer
userComment: list = []  # comment count of each answer
userLike: list = []     # upvote count of each answer
totals: list = []       # total answer count reported by the API, one entry per request
def zhuhuSipder(page):
    """Fetch one page of answers for the hard-coded Zhihu question and record them.

    The request is signed the way the API expects: an MD5 digest of
    ``"101_3_2.0" + path-with-query + d_c0`` is passed to function ``b`` of
    ``g_encrypt.js`` and the result is sent in the ``x-zse-96`` header.

    For every answer in the response, the author id, author name, formatted
    creation time, plain-text content, comment count and upvote count are
    appended to the module-level lists ``userId``, ``userName``, ``userTime``,
    ``userContent``, ``userComment`` and ``userLike``. The reported answer
    total is appended to ``totals``.

    Args:
        page: Offset of the first answer to fetch (sent as ``offset``); five
            answers are requested per call.

    Returns:
        ``totals[0]``, i.e. the first answer total recorded in ``totals``.

    Raises:
        requests.exceptions.HTTPError: If the API replies with an error status.
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    url = '/api/v4/questions/267209533/answers?'
    params = {
        'include': 'data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,attachment,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,relevant_info,question,excerpt,is_labeled,paid_info,paid_info_content,reaction_instruction,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,is_recognized;data[*].mark_infos[*].url;data[*].author.follower_count,vip_info,badge[*].topics;data[*].settings.table_of_content.enabled',
        'offset': str(page),
        'limit': '5',
        'sort_by': 'default',
        'platform': 'desktop',
    }

    # Placeholder for the d_c0 cookie value (surrounding double quotes
    # included). It is kept in one place because the signature and the cookie
    # header must carry exactly the same value.
    d_c0 = '"cookie的d_c0值"'

    # Signature source: version tag, request path with query string, cookie.
    sign_source = "+".join(["101_3_2.0", url + urllib.parse.urlencode(params), d_c0])
    fmd5 = hashlib.md5(sign_source.encode()).hexdigest()

    # Read the script with an explicit encoding so the result does not depend
    # on the platform's locale default.
    with open('g_encrypt.js', 'r', encoding='utf-8') as js_file:
        js_source = js_file.read()
    ctx1 = execjs.compile(js_source, cwd=r'Nodejs的node_modules路径')
    encrypt_str = ctx1.call('b', fmd5)

    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36",
        "cookie": 'd_c0=%s;' % d_c0,
        "x-api-version": "3.0.91",
        "x-zse-93": "101_3_2.0",
        "x-zse-96": "2.0_%s" % encrypt_str,
    }

    # A timeout keeps a stalled connection from hanging the scraper forever,
    # and raise_for_status surfaces a rejected request as an HTTP error
    # instead of a KeyError on the missing "data" key further down.
    reply = requests.get(
        "https://www.zhihu.com" + url,
        headers=headers,
        params=params,
        timeout=30,
    )
    reply.raise_for_status()
    response = reply.json()

    time_format = "%Y-%m-%d %H:%M:%S"
    for answer in response["data"]:
        author = answer['author']
        # The answer body is HTML; keep only its text.
        text = BeautifulSoup(answer['content'], 'lxml').get_text()
        # created_time is an epoch timestamp; render it in local time.
        created = time.strftime(time_format, time.localtime(answer["created_time"]))

        userId.append(author['id'])
        userTime.append(created)
        userName.append(author['name'])
        userContent.append(text)
        userComment.append(answer['comment_count'])
        userLike.append(answer['voteup_count'])

    # Total number of answers for the question, as reported by the API.
    totals.append(response["paging"]["totals"])
    return totals[0]
def mulitypage():
    """Scrape every page of answers by calling ``zhuhuSipder`` with rising offsets.

    The first call (offset 0) records the answer total in ``totals``; the
    remaining pages are then fetched in steps of five until the offset reaches
    that total. No request is issued for an offset at or beyond the total.
    """
    page_size = 5  # must match the 'limit' value sent by zhuhuSipder
    page = 0
    zhuhuSipder(page)
    time.sleep(10)

    total = totals[0]
    page += page_size
    while page < total:
        # Report the page that is about to be fetched (page 0 was fetched above).
        print("正在抓取第{}页".format(page // page_size))
        zhuhuSipder(page)
        page += page_size
# Save the collected data
def savedata():
    """Write the collected answer fields to ``cat.xlsx``, one row per answer.

    Rows are built by zipping the module-level lists ``userId``, ``userTime``,
    ``userName``, ``userLike``, ``userComment`` and ``userContent`` in that
    order.
    """
    # Column headers are emitted into the spreadsheet exactly as spelled here.
    column_names = ["id", "time", "name", "like", "comment", "cotent"]
    rows = list(zip(userId, userTime, userName, userLike, userComment, userContent))
    table = pd.DataFrame(rows, columns=column_names)
    table.to_excel("cat.xlsx")
# When executed as a script: run the multi-page scrape, then save the results.
if __name__ == "__main__":
    mulitypage()
    savedata()