首先,知乎获取回答的接口是这样的
https://www.zhihu.com/api/v4/questions/348488122/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit=5&offset=0&platform=desktop&sort_by=default
找到接口之后,只要模拟浏览器访问知乎时的请求头发送 HTTP GET 请求就行了,也不需要任何验证。URL 中 questions 后面的数字是问题的 id;请求参数 offset 指定从第几条回答开始获取(从 0 开始),limit 是每次请求返回的回答条数(这里是 5)。其它的参数我不知道是什么意思,这里也用不到。
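下面就是用 Python 写的完整代码。运行前需要先安装 requests 和 mysql-connector-python,并在本机 MySQL 中建好名为 zhihu 的数据库;运行时把问题的 id 作为命令行参数传入,满足这些条件后直接就能用。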
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import json
import requests
import time
import sys
import mysql.connector
# The Zhihu question ID is the script's only command-line argument. It is
# validated up front because later code both embeds it in the request URL and
# passes it through int() to form the MySQL table name; failing here gives a
# usage message instead of a bare IndexError/ValueError half-way through.
try:
    int(sys.argv[1])
except (IndexError, ValueError):
    sys.exit("usage: %s <question_id>" % sys.argv[0])
# Kept as a string: it is concatenated into URLs and used as the table name.
questionID = sys.argv[1]

# Open the MySQL connection shared by the rest of the script.
# NOTE(review): the credentials are hard-coded; consider moving them to
# configuration or environment variables before using this outside a sandbox.
db = mysql.connector.connect(
    host="localhost",
    user="root",
    passwd="1234",
    database="zhihu",
)
def checkTableExists(dbcon, tablename):
    """Return True if *tablename* exists in the connection's current database.

    Args:
        dbcon: An open MySQL connection; only ``cursor()`` is used.
        tablename: Name of the table to look up. It is converted with
            ``str()`` and sent as a bound parameter, never interpolated.

    Returns:
        True when ``information_schema.tables`` lists at least one table with
        that name in the current schema, False otherwise.
    """
    dbcur = dbcon.cursor()
    try:
        # Restrict the lookup to the current schema: without this filter a
        # same-named table in another database would be counted as well.
        dbcur.execute(
            "SELECT COUNT(*) "
            "FROM information_schema.tables "
            "WHERE table_schema = DATABASE() AND table_name = %s",
            (str(tablename),),
        )
        return dbcur.fetchone()[0] > 0
    finally:
        # Release the cursor even if the query raises.
        dbcur.close()
# Each question is stored in its own table, named after the question ID.
tableName = questionID
if not checkTableExists(db, tableName):
    print("CREATE NEW TABLE")
    mycursor = db.cursor()
    try:
        # Identifiers cannot be sent as bound parameters, so the table name is
        # formatted into the statement; int() ensures only a number is
        # interpolated between the backticks.
        mycursor.execute('''
CREATE TABLE `%s` (
`ID` int(11) NOT NULL AUTO_INCREMENT PRIMARY KEY,
`author_name` text NOT NULL,
`voteup_count` int(11) NOT NULL,
`url_token` longtext NOT NULL,
`avatar_url` longtext NOT NULL,
`content` longtext NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
''' % (int(tableName)))
    finally:
        # The cursor is only needed for the DDL statement; always release it.
        mycursor.close()
else:
    print("TABLE EXISTS!")
# HTTP headers sent with every API request: a desktop Chrome User-Agent, the
# Zhihu host name, and a Referer pointing at the page of the question being
# fetched.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    ),
    'Host': "www.zhihu.com",
}
header['Referer'] = "https://www.zhihu.com/question/%s" % (questionID,)
def answer(url_, timeout=10):
    """Fetch *url_* with the module-level ``header`` and return the parsed JSON.

    Args:
        url_: Full URL to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The response body decoded with ``json.loads``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within *timeout* seconds.
        ValueError: If the response body is not valid JSON.
    """
    r = requests.get(url_, headers=header, timeout=timeout)
    # Fail on HTTP errors here rather than with a confusing KeyError later
    # when the caller indexes into an error payload.
    r.raise_for_status()
    return json.loads(r.text)
# Number of answers requested per API call; also the step by which the paging
# offset advances.
_PAGE_SIZE = 5

# Value of the "include" query parameter (the field selection), kept exactly
# as it appears in the captured answers-API request.
_ANSWERS_INCLUDE = "data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics"


def _answers_url(offset):
    """Build the answers-API URL for the page of results starting at *offset*."""
    # Plain concatenation on purpose: the include value contains literal '%'
    # escapes and must not go through %-formatting.
    return (
        "https://www.zhihu.com/api/v4/questions/" + str(questionID)
        + "/answers?include=" + _ANSWERS_INCLUDE
        + "&limit=" + str(_PAGE_SIZE)
        + "&offset=" + str(offset)
        + "&platform=desktop&sort_by=default"
    )


# Table identifiers cannot be bound as query parameters, so the name is
# formatted in explicitly; int() ensures only a number ends up between the
# backticks. The answer fields themselves are always sent as bound parameters.
_table = "`%d`" % int(tableName)
_SELECT_SQL = "select count(*) from " + _table + " where content=%s"
_INSERT_SQL = (
    "insert into " + _table
    + "(author_name,voteup_count,url_token,avatar_url,content)"
    + " values (%s,%s,%s,%s,%s)"
)

try:
    # The first request is only used to learn how many answers there are.
    answer_total = int(answer(_answers_url(0))['paging']['totals'])
    offset = 0
    while offset < answer_total:
        url = _answers_url(offset)
        offset += _PAGE_SIZE
        # Progress is reported after advancing, so the last page may show a
        # value larger than the total.
        print("----%s/%s----" % (offset, answer_total))
        data = answer(url)['data']
        for item in data:
            author = item['author']
            author_name = author['name']
            author_token = author['url_token']
            # Strip the "_is" marker from the avatar URL.
            # NOTE(review): presumably a thumbnail-size suffix - confirm.
            avatar_url = str(author['avatar_url']).replace("_is", "")
            answer_content = item['content']
            voteup_count = item['voteup_count']
            # Entries posted by this account are skipped; the log message
            # below labels them as promoted "盐选" recommendations.
            if author_token == 'zhujiangren':
                print("跳过盐选推荐")
                continue
            cursor = db.cursor()
            try:
                cursor.execute(_SELECT_SQL, (answer_content,))
                if cursor.fetchone()[0] > 0:
                    # Already stored: skip only this answer. Leaving the loop
                    # here would silently drop the rest of the page.
                    continue
                cursor.execute(
                    _INSERT_SQL,
                    (author_name, voteup_count, author_token, avatar_url,
                     answer_content))
                db.commit()
            finally:
                # Runs on the duplicate path and on errors too, so no cursor
                # is left open.
                cursor.close()
            print("成功获取回答! %s" % (author_name))
            time.sleep(1)
        # Pause between page requests as well.
        time.sleep(1)
finally:
    # Close the connection even if a request or query fails part-way.
    db.close()