分类
学习笔记

python爬知乎回答

首先,知乎获取回答的接口是这样的

https://www.zhihu.com/api/v4/questions/348488122/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit=5&offset=0&platform=desktop&sort_by=default

找到接口之后,只要模拟知乎的请求头发送 HTTP 请求就行了,不需要任何验证。URL 中 questions 后面是问题的 id,请求参数 offset 指定从第几条回答开始获取;其它参数我不知道是什么意思,这里也用不到。

下面是用 Python 写的完整代码,可以直接使用(需要本地 MySQL 中已建好名为 zhihu 的数据库,运行时把问题 id 作为第一个命令行参数传入):

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import json
import requests
import time
import sys
import mysql.connector


# The Zhihu question id is the script's only command-line argument. It is
# later passed through int() to build the table name, so reject a missing or
# non-integer value here with a usage message instead of failing with a bare
# traceback after the database connection has already been opened.
try:
    int(sys.argv[1])
except (IndexError, ValueError):
    sys.exit("usage: %s <question_id>" % sys.argv[0])

questionID = sys.argv[1]

# Connect to the local MySQL server; the "zhihu" database must already exist.
# NOTE(review): credentials are hard-coded for a local setup - move them to
# configuration or environment variables before using this anywhere shared.
db = mysql.connector.connect(
    host="localhost",
    user="root",
    passwd="1234",
    database="zhihu"
)


def checkTableExists(dbcon, tablename):
    """Return True if *tablename* exists in the connection's current database.

    Args:
        dbcon: An open DB-API connection (here a mysql.connector connection)
            with a default database selected.
        tablename: Name of the table to look for.

    Returns:
        True when the table exists in the current database, otherwise False.
    """
    dbcur = dbcon.cursor()
    try:
        # Restrict the lookup to the current schema: information_schema.tables
        # lists tables from every database on the server, so a same-named
        # table elsewhere must not count. The name is sent as a bound
        # parameter rather than spliced into the SQL text.
        dbcur.execute(
            """
            SELECT COUNT(*)
            FROM information_schema.tables
            WHERE table_schema = DATABASE()
              AND table_name = %s
            """,
            (str(tablename),))
        return dbcur.fetchone()[0] > 0
    finally:
        # Always release the cursor, even if the query raised.
        dbcur.close()


# Each question is stored in its own table, named after the question id.
tableName = questionID

if not checkTableExists(db, tableName):
    print("CREATE NEW TABLE")
    mycursor = db.cursor()
    try:
        # int() guarantees that only digits are spliced into the identifier;
        # table names cannot be sent as bound parameters.
        mycursor.execute('''
    CREATE TABLE `%s` (
      `ID` int(11) NOT NULL AUTO_INCREMENT PRIMARY KEY,
      `author_name` text NOT NULL,
      `voteup_count` int(11) NOT NULL,
      `url_token` longtext NOT NULL,
      `avatar_url` longtext NOT NULL,
      `content` longtext NOT NULL
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    ''' % (int(tableName)))
    finally:
        # Release the cursor whether or not the DDL succeeded.
        mycursor.close()
else:
    print("TABLE EXISTS!")

# Browser-like request headers attached to every API request; the Referer
# points at the question page being crawled.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    ),
    'Host': "www.zhihu.com",
    'Referer': "https://www.zhihu.com/question/{}".format(questionID),
}


def answer(url_):
    """Fetch *url_* with the module-level request headers and decode the JSON.

    Args:
        url_: Full URL of the API endpoint to request.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.get(url_, headers=header, timeout=10)
    # Surface HTTP errors here rather than as a confusing KeyError later when
    # the caller indexes into an error payload.
    r.raise_for_status()
    return json.loads(r.text)


# Fields requested from the answers endpoint (already URL-encoded).
ANSWERS_INCLUDE = "data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics"

# Number of answers requested per page (the "limit" query parameter).
PAGE_SIZE = 5


def build_answers_url(offset_):
    """Return the answers API URL for the current question at *offset_*."""
    return ("https://www.zhihu.com/api/v4/questions/" + str(questionID)
            + "/answers?include=" + ANSWERS_INCLUDE
            + "&limit=" + str(PAGE_SIZE)
            + "&offset=" + str(offset_)
            + "&platform=desktop&sort_by=default")


# The first page is fetched only to learn how many answers there are.
answer_total = int(answer(build_answers_url(0))['paging']['totals'])

# A table name is an identifier, not a value, so it is formatted into the SQL
# text (int() restricts it to digits) while the row values stay bound
# parameters.
quoted_table = "`%d`" % int(tableName)
select_sql = "select count(*) from " + quoted_table + " where content=%s"
insert_sql = ("insert into " + quoted_table
              + "(author_name,voteup_count,url_token,avatar_url,content)"
              + " values (%s,%s,%s,%s,%s)")

try:
    offset = 0
    while offset < answer_total:
        url = build_answers_url(offset)

        offset += PAGE_SIZE

        print("----%s/%s----" % (offset, answer_total))

        data = answer(url)['data']

        for item in data:
            author = item['author']
            author_name = author['name']
            author_token = author['url_token']
            avatar_url = str(author['avatar_url']).replace("_is", "")
            answer_content = item['content']
            voteup_count = item['voteup_count']

            # Entries from this account are skipped and never stored.
            if author_token == 'zhujiangren':
                print("跳过盐选推荐")
                continue

            cursor = db.cursor()
            try:
                cursor.execute(select_sql, (answer_content,))
                already_stored = cursor.fetchone()[0] > 0

                if not already_stored:
                    cursor.execute(
                        insert_sql,
                        (author_name, voteup_count, author_token,
                         avatar_url, answer_content))
                    db.commit()
            finally:
                # Release the cursor on every path, including duplicates.
                cursor.close()

            if already_stored:
                # Skip only this answer; the rest of the page may still hold
                # answers that are not in the table yet.
                continue

            print("成功获取回答! %s" % (author_name))

            time.sleep(1)

        # Pause between pages to keep the request rate low.
        time.sleep(1)
finally:
    # Close the connection even when a request or query fails midway.
    db.close()

发表评论

您的电子邮箱地址不会被公开。 必填项已用*标注