获取WP网站内文章中失效的图片

import mysql.connector
import re
import requests
# Connect to the local WordPress database.
connection = mysql.connector.connect(host='localhost',
                                     database='wordpress',
                                     user='root',
                                     password='')

# Only published posts are scanned; each row is (post_content, ID, post_title).
sql_select_Query = "select post_content,ID,post_title from wp_posts where post_status='publish' and post_type='post'"
cursor = connection.cursor()
try:
    cursor.execute(sql_select_Query)
    # get all records
    records = cursor.fetchall()
finally:
    # Everything needed is in memory now, so the database resources can be
    # released before the (slow) HTTP checks start.
    cursor.close()
    connection.close()

# Captures the value of the src attribute of every <img> tag.
p = re.compile(r'<img.+?src=[\'"]([^\'"]+)[\'"].*?>')

# The report lists, for every broken image, the post id and title on one line
# followed by the image URL on the next. The with-block guarantees the file is
# flushed and closed even if the scan is interrupted.
with open('urls.txt', 'w', encoding='utf-8') as fw:
    for post_content, post_id, post_title in records:
        for url in p.findall(post_content):
            try:
                # requests has no default timeout; without one a single
                # stalled server would hang the whole scan.
                broken = requests.get(url, timeout=10).status_code != 200
            except requests.RequestException:
                # Unreachable host, timeout, relative or malformed URL: the
                # image cannot be loaded, so it is reported as broken instead
                # of aborting the scan.
                broken = True

            if broken:
                print(url)
                fw.write(str(post_id) + ' ' + post_title + '\n')
                fw.write(url + '\n')

python sqlite3 插入数据

import sqlite3
from sqlite3 import Error


def create_connection(db_file):
    """Open the SQLite database stored at ``db_file``.

    :param db_file: path of the database file
    :return: the open Connection, or None when sqlite3 reports an error
        (the error is printed, not raised)
    """
    try:
        return sqlite3.connect(db_file)
    except Error as exc:
        print(exc)
    return None


def create_task(conn, task):
    """Insert one row into the ``main_post`` table and commit it.

    :param conn: open database connection
    :param task: values for (title, content, category, created_on, tag,
        post_status), in that order
    :return: the cursor's ``lastrowid`` after the insert
    """
    insert_sql = ''' INSERT INTO main_post(title,content,category,created_on,tag,post_status)
              VALUES(?,?,?,?,?,?) '''
    cursor = conn.cursor()
    cursor.execute(insert_sql, task)
    conn.commit()
    return cursor.lastrowid


# Path of the SQLite database file (a relative path, resolved against the
# current working directory).
database = r"Desktop\django-blog\blog\db.sqlite3"

# create_connection returns None (after printing the error) if the database
# could not be opened.
conn = create_connection(database)


## Can be called repeatedly, once for every row to insert.
# NOTE(review): x, category and html are not defined in this snippet;
# presumably they come from the caller's own loop - confirm before running.
# The tuple follows create_task's column order:
# (title, content, category, created_on, tag, post_status).
# NOTE(review): the result name shadows the builtin id().
id = create_task(conn, (x[1], x[2], category, x[4], html, 1))

参考SQLite Python: Inserting Data

python爬知乎回答

首先,知乎获取回答的接口是这样的

https://www.zhihu.com/api/v4/questions/348488122/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit=5&offset=0&platform=desktop&sort_by=default

找到接口之后,只要模拟知乎的请求头发送HTTP请求就行了,不需要任何验证。questions后面是问题的id,请求中的offset指定从第几条回答开始获取,limit指定每次返回多少条回答;其它参数我不清楚具体含义,这里也用不到。

下面就是用python写的完整的代码了,直接就能用

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import json
import requests
import time
import sys
import mysql.connector


# The only command-line argument is the numeric id of the Zhihu question.
# It is converted with int() further down to build the table name, so it is
# validated here to fail with a usage message instead of a traceback.
try:
    int(sys.argv[1])
except (IndexError, ValueError):
    sys.exit("usage: %s <numeric question id>" % sys.argv[0])

questionID = sys.argv[1]

# Connection to the local MySQL database that stores the crawled answers.
db = mysql.connector.connect(
    host="localhost",
    user="root",
    passwd="1234",
    database="zhihu"
)


def checkTableExists(dbcon, tablename):
    """Return True if a table named ``tablename`` exists in the current database.

    :param dbcon: open MySQL connection (its cursors must accept ``%s``
        placeholders)
    :param tablename: name of the table to look for
    :return: True when the table exists, False otherwise
    """
    dbcur = dbcon.cursor()
    try:
        # information_schema.tables lists the tables of every schema on the
        # server, so the lookup is restricted to the connection's default
        # database; otherwise a same-named table elsewhere skews the count.
        # The name is bound as a parameter instead of being spliced into the
        # statement with hand-rolled quote escaping.
        dbcur.execute(
            """
            SELECT COUNT(*)
            FROM information_schema.tables
            WHERE table_schema = DATABASE()
              AND table_name = %s
            """,
            (str(tablename),))
        return dbcur.fetchone()[0] > 0
    finally:
        # Release the cursor on every path, including when execute() raises.
        dbcur.close()


# Each question is stored in its own table, named after the question id.
tableName = questionID

if checkTableExists(db, tableName):
    print("TABLE EXISTS!")
else:
    print("CREATE NEW TABLE")
    mycursor = db.cursor()
    # int() ensures that only a number is interpolated into the DDL statement.
    create_table_sql = '''
    CREATE TABLE `%s` (
      `ID` int(11) NOT NULL AUTO_INCREMENT PRIMARY KEY,
      `author_name` text NOT NULL,
      `voteup_count` int(11) NOT NULL,
      `url_token` longtext NOT NULL,
      `avatar_url` longtext NOT NULL,
      `content` longtext NOT NULL
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    ''' % (int(tableName))
    mycursor.execute(create_table_sql)

# Request headers sent with every API call: a desktop Chrome User-Agent plus
# Host and Referer values that point at the question page.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    ),
    'Host': "www.zhihu.com",
    'Referer': "https://www.zhihu.com/question/" + str(questionID)
}


def answer(url_):
    """Fetch ``url_`` with the module-level ``header`` and return the parsed JSON.

    :param url_: URL to request
    :return: the response body decoded with ``json.loads``
    :raises requests.RequestException: on connection errors, timeouts or an
        HTTP error status
    """
    # requests applies no timeout by default; without one a stalled
    # connection would hang the crawler forever.
    r = requests.get(url_, headers=header, timeout=30)
    # Surface HTTP errors here instead of as a confusing JSONDecodeError or
    # KeyError further down in the caller.
    r.raise_for_status()
    return json.loads(r.text)


# The answers endpoint is paged with limit/offset. Everything except the
# offset is constant, so the URL is assembled from a fixed prefix and suffix
# instead of repeating the long query string.
PAGE_SIZE = 5
API_URL_PREFIX = (
    "https://www.zhihu.com/api/v4/questions/" + str(questionID)
    + "/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit="
    + str(PAGE_SIZE) + "&offset="
)
API_URL_SUFFIX = "&platform=desktop&sort_by=default"

# The first page is requested once just to learn how many answers exist.
url = API_URL_PREFIX + "0" + API_URL_SUFFIX

answer_total = int(answer(url)['paging']['totals'])

try:
    offset = 0
    while offset < answer_total:
        url = API_URL_PREFIX + str(offset) + API_URL_SUFFIX

        offset += PAGE_SIZE

        # Progress shows the offset *after* this page, so the last line can
        # exceed the total when it is not a multiple of the page size.
        print("----%s/%s----" % (offset, answer_total))

        data = answer(url)['data']

        for item in data:
            author = item['author']
            author_name = author['name']
            author_token = author['url_token']
            # NOTE(review): stripping "_is" presumably selects a different
            # avatar size variant - confirm against the API.
            avatar_url = str(author['avatar_url']).replace("_is", "")
            answer_content = item['content']
            voteup_count = item['voteup_count']

            # Answers posted by this account are skipped entirely.
            if author_token == 'zhujiangren':
                print("跳过盐选推荐")
                continue

            cursor = db.cursor()
            try:
                cursor.execute(
                    "select count(*) from `%s` where content=%s",
                    (int(tableName), answer_content))

                if cursor.fetchone()[0] > 0:
                    # Already stored: the rest of this page is skipped, but
                    # the outer loop still moves on to the next page.
                    # NOTE(review): unclear whether `continue` (skip only this
                    # answer) or stopping the whole crawl was intended.
                    break

                cursor.execute(
                    "insert into `%s`(author_name,voteup_count,url_token,avatar_url,content) values (%s,%s,%s,%s,%s)",
                    (int(tableName), author_name, voteup_count, author_token, avatar_url, answer_content))
                db.commit()
            finally:
                # Close the cursor on every path, including the break above.
                cursor.close()

            print("成功获取回答! %s" % (author_name))

            # Throttle: pause after every stored answer ...
            time.sleep(1)

        # ... and once more after every page.
        time.sleep(1)
finally:
    # Release the connection even when a request or query fails mid-crawl.
    db.close()

Python lambda 匿名函数

在计算机编程中,匿名函数(英语:anonymous function)是指一类无需定义标识符(函数名)的函数或子程序,普遍存在于多种编程语言中。

在Python中,我们可以通过使用lambda表达式来创建匿名函数,lambda函数的语法也非常简单:

lambda argument_list:expression

参数由逗号分隔的参数列表组成,表达式是使用这些参数的算术表达式且只能有一个,即不用写return。您可以将该函数分配给一个变量以赋予其名称。

匿名函数的好处在于它没有名字,所以你不用担心变量名会起冲突。

下面的lambda函数示例返回其两个参数的总和:

>>> sum = lambda x,y:x + y
>>> sum(3,4)
7
>>>

将上面的代码写成一般的函数表达式是这样的:

>>> def sum(x,y):
...     return x + y
...
>>> sum(3,4)
7
>>>

map()函数

map()接受两个参数:一个函数和一个可迭代对象(例如列表),它会把该函数依次应用到其中的每一项上。在Python 3中,map()返回的是一个迭代器,所以下面的例子用list()把结果转换成列表。

例如我们通过map()函数计算一个列表中每一项的平方,可以直接在map()中传入一个匿名函数(lambda)

>>> list(map(lambda x: x * x, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
[1, 4, 9, 16, 25, 36, 49, 64, 81]

下面有一个更直观的例子让你看到它的优点:

>>> def fahrenheit(T):
...     return ((float(9)/5)*T + 32)
...
>>> def celsius(T):
...     return (float(5)/9)*(T-32)
...
>>> temperatures = (36.5, 37, 37.5, 38, 39)
>>> F = map(fahrenheit, temperatures)
>>> C = map(celsius, F)
>>>
>>> temperatures_in_Fahrenheit = list(map(fahrenheit, temperatures))
>>> temperatures_in_Celsius = list(map(celsius, temperatures_in_Fahrenheit))
>>> print(temperatures_in_Fahrenheit)
[97.7, 98.60000000000001, 99.5, 100.4, 102.2]
>>> print(temperatures_in_Celsius)
[36.5, 37.00000000000001, 37.5, 38.00000000000001, 39.0]
>>>

上面是一个计算华氏度和摄氏度的例子,在下面,我们使用lambda匿名函数来重写,你可以看到整个代码瞬间简洁了不少。

>>> C = [39.2, 36.5, 37.3, 38, 37.8]
>>> F = list(map(lambda x: (float(9)/5)*x + 32, C))
>>> print(F)
[102.56, 97.7, 99.14, 100.4, 100.03999999999999]
>>> C = list(map(lambda x: (float(5)/9)*(x-32), F))
>>> print(C)
[39.2, 36.5, 37.300000000000004, 38.00000000000001, 37.8]
>>>

参考

Python Generator

生成器(Generator)是Python语言中一个很独特的特性。生成器可以由函数来定义,这样的generator函数一定会包含yield语句。我们知道,一般函数遇到return语句时就会终止执行并返回结果;而在用for迭代generator时,generator函数每运行到yield语句处就会暂停并产出一个值,等到需要下一个值时,再从上次yield的位置继续执行。

生成器表达式

跟列表生成式一样,生成器也可以用类似表达式的形式写出来,只需要把[]改成()就可以创建一个generator。

# Generator expression: list-comprehension syntax wrapped in parentheses;
# it creates a generator that yields 0, 1, 2 one value at a time.
iterator = (i for i in range(3))

你可以使用next()函数来获取generator的下一个值,就像这样

>>> next(iterator)
0
>>> next(iterator)
1
>>> next(iterator)
2
>>> next(iterator)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
StopIteration

当next()第一次被调用时,函数体从开头开始执行,直到遇到yield语句返回结果;随后每次调用next()都会从上次yield语句处继续执行到下一个yield,直到没有值可以产出时抛出StopIteration异常。

但是事实上我们不可能一直调用next函数,如果要打印generator所有的值,可以使用for循环,就像这样

>>> iterator = (i for i in range(3))
>>> for item in iterator:
...     print(item)
...
0
1
2

generator函数

生成器表达式可以用来表达一些比较简单的迭代,但是遇到较复杂的条件就没办法写了,所以这个时候就可以用generator函数来表示比较复杂的逻辑关系。

比如斐波那契数列,从第三个数开始,每个数等于前两个数之和。如果用函数来表示,我们可以这样写

def fib(max):
    """Print the first ``max`` Fibonacci numbers (1, 1, 2, 3, ...), one per line.

    :param max: how many numbers to print
    :return: the string 'done'
    """
    a, b = 0, 1
    n = 0
    while n < max:
        print(b)
        # Slide the (previous, current) pair one step along the sequence.
        a, b = b, a + b
        n += 1
    return 'done'

上面是一个一般函数,如果要把它变成generator函数我们只需要把输出语句print(b)改成yield语句即可,就像这样

def gen_fib(max):
    """Yield the first ``max`` Fibonacci numbers (1, 1, 2, 3, ...).

    :param max: how many numbers to yield
    :return: 'done', carried as the value of the final StopIteration
    """
    a, b = 0, 1
    n = 0
    while n < max:
        yield b
        # Slide the (previous, current) pair one step along the sequence.
        a, b = b, a + b
        n += 1
    return 'done'

同样的,刚开始说过,一个函数包含了yield语句那它就是一个generator函数

>>> fib(10)
1
1
2
3
5
8
13
21
34
55
'done'
>>> gen_fib(10)
<generator object gen_fib at 0x0000024A02DC5B48>
>>> f = gen_fib(10)
>>> next(f)
1
>>> next(f)
1
>>> next(f)
2
>>> next(f)
3
>>> next(f)
5