import mysql.connector
import re
import requests
# Scan every published WordPress post for <img> tags whose source URL does
# not answer HTTP 200, and record the post (ID + title) and the URL in
# urls.txt.
connection = mysql.connector.connect(host='localhost',
                                     database='wordpress',
                                     user='root',
                                     password='')
sql_select_Query = "select post_content,ID,post_title from wp_posts where post_status='publish' and post_type='post'"
# Captures the value of the src attribute of each <img ...> tag.
p = re.compile(r'<img.+?src=[\'"]([^\'"]+)[\'"].*?>')


def _image_is_broken(url):
    """Return True when fetching ``url`` fails or does not answer HTTP 200."""
    try:
        # The timeout keeps one stalled host from hanging the whole scan.
        return requests.get(url, timeout=10).status_code != 200
    except requests.RequestException:
        # Unreachable hosts, malformed or relative URLs and timeouts count
        # as broken images; they must not abort the scan.
        return True


try:
    cursor = connection.cursor()
    cursor.execute(sql_select_Query)
    # get all records; each row is (post_content, ID, post_title)
    records = cursor.fetchall()
    cursor.close()
    # The context manager guarantees urls.txt is flushed and closed.
    with open('urls.txt', 'w', encoding='utf-8') as fw:
        for i in records:
            for url in p.findall(i[0]):
                if _image_is_broken(url):
                    print(url)
                    fw.write(str(i[1]) + ' ' + i[2] + '\n')
                    fw.write(url + '\n')
finally:
    connection.close()
标签: python
import sqlite3
from sqlite3 import Error
def create_connection(db_file):
    """Create a database connection to the SQLite database in ``db_file``.

    :param db_file: database file
    :return: Connection object, or None when the database cannot be opened
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
    except Error as e:
        # Best effort: report the problem and let the caller deal with None.
        print(e)
    return conn
def create_task(conn, task):
    """Insert one row into the ``main_post`` table and commit it.

    :param conn: open database connection
    :param task: six values in the order
        (title, content, category, created_on, tag, post_status)
    :return: ``lastrowid`` of the cursor that performed the insert
    """
    sql = ''' INSERT INTO main_post(title,content,category,created_on,tag,post_status)
VALUES(?,?,?,?,?,?) '''
    cur = conn.cursor()
    cur.execute(sql, task)
    conn.commit()
    return cur.lastrowid
# Path of the SQLite database file (relative to the working directory).
database = r"Desktop\django-blog\blog\db.sqlite3"
conn = create_connection(database)
## can be called repeatedly
# NOTE(review): x, category and html are not defined in this snippet —
# presumably they are supplied by surrounding code; confirm before running.
id = create_task(conn, (x[1], x[2], category, x[4], html, 1))
首先,知乎获取回答的接口是这样的
https://www.zhihu.com/api/v4/questions/348488122/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit=5&offset=0&platform=desktop&sort_by=default
找到接口之后只要模拟知乎的请求头发送http请求就行了,也不需要任何验证,questions后面是问题的id,然后请求中的offset是指定获取第几条回答,其它的参数我不知道是什么意思,也用不到
下面就是用python写的完整的代码了,直接就能用
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import json
import requests
import time
import sys
import mysql.connector
# The question id is the first command-line argument. It is kept as a
# string here; later code converts it with int() to build the table name,
# so a non-numeric value is rejected up front instead of failing mid-run.
if len(sys.argv) < 2:
    sys.exit("usage: %s <question_id>" % sys.argv[0])
questionID = sys.argv[1]
try:
    int(questionID)
except ValueError:
    sys.exit("question id must be an integer, got %r" % questionID)
# connect to the database that stores the answers
db = mysql.connector.connect(
    host="localhost",
    user="root",
    passwd="1234",
    database="zhihu"
)
def checkTableExists(dbcon, tablename):
    """Return True when ``tablename`` exists in the connection's current database.

    :param dbcon: open MySQL connection
    :param tablename: name of the table to look for
    :return: True if the table exists, False otherwise
    """
    dbcur = dbcon.cursor()
    try:
        # The name is bound as a query parameter instead of being spliced
        # into the SQL text. information_schema.tables lists tables of
        # every schema, so the lookup is limited to the current database.
        dbcur.execute(
            "SELECT COUNT(*) "
            "FROM information_schema.tables "
            "WHERE table_schema = DATABASE() AND table_name = %s",
            (str(tablename),))
        return dbcur.fetchone()[0] > 0
    finally:
        # Release the cursor on success and on error alike.
        dbcur.close()
# One table per question, named after the question id.
tableName = questionID
if not checkTableExists(db, tableName):
    print("CREATE NEW TABLE")
    mycursor = db.cursor()
    try:
        # int() ensures nothing but an integer is interpolated into the
        # back-quoted identifier.
        mycursor.execute('''
CREATE TABLE `%s` (
`ID` int(11) NOT NULL AUTO_INCREMENT PRIMARY KEY,
`author_name` text NOT NULL,
`voteup_count` int(11) NOT NULL,
`url_token` longtext NOT NULL,
`avatar_url` longtext NOT NULL,
`content` longtext NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
''' % (int(tableName)))
    finally:
        mycursor.close()
else:
    print("TABLE EXISTS!")
# HTTP request headers passed to requests.get() by answer().
header = {
# Desktop Chrome user-agent string (adjacent literals are concatenated).
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
'AppleWebKit/537.36 (KHTML, '
'like Gecko) Chrome/67.0.3396.99 '
'Safari/537.36',
'Host': "www.zhihu.com",
# Referer is the page of the question being fetched.
'Referer': "https://www.zhihu.com/question/" + str(questionID)
}
def answer(url_):
    """GET ``url_`` with the module-level ``header`` and return the parsed JSON.

    :param url_: URL to request
    :return: the object produced by ``json.loads`` on the response text
    """
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.get(url_, headers=header, timeout=10)
    data = r.text
    jsonobj = json.loads(data)
    return jsonobj
# Number of answers requested per API call; also the offset step.
PAGE_SIZE = 5
# The answers endpoint URL is <head> + offset + <tail>.
_ANSWERS_URL_HEAD = "https://www.zhihu.com/api/v4/questions/" + str(
    questionID) + "/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit=" + str(
    PAGE_SIZE) + "&offset="
_ANSWERS_URL_TAIL = "&platform=desktop&sort_by=default"

# Identifiers cannot be bound as query parameters, so the table name is
# formatted into the SQL text; int() ensures only an integer gets there.
# The remaining %s markers are real bound parameters.
_TABLE = int(tableName)
_SELECT_SQL = "select count(*) from `{0}` where content=%s".format(_TABLE)
_INSERT_SQL = (
    "insert into `{0}`(author_name,voteup_count,url_token,avatar_url,content) "
    "values (%s,%s,%s,%s,%s)").format(_TABLE)

# The first request is only used to learn how many answers exist.
url = _ANSWERS_URL_HEAD + "0" + _ANSWERS_URL_TAIL
answer_total = int(answer(url)['paging']['totals'])
offset = 0
try:
    while offset < answer_total:
        url = _ANSWERS_URL_HEAD + str(offset) + _ANSWERS_URL_TAIL
        offset += PAGE_SIZE
        print("----%s/%s----" % (offset, answer_total))
        data = answer(url)['data']
        for item in data:
            author = item['author']
            author_name = author['name']
            author_token = author['url_token']
            avatar_url = str(author['avatar_url']).replace("_is", "")
            answer_content = item['content']
            voteup_count = item['voteup_count']
            if author_token == 'zhujiangren':
                print("跳过盐选推荐")
                continue
            cursor = db.cursor()
            try:
                cursor.execute(_SELECT_SQL, (answer_content,))
                already_saved = cursor.fetchone()[0] > 0
                if not already_saved:
                    cursor.execute(
                        _INSERT_SQL,
                        (author_name, voteup_count, author_token,
                         avatar_url, answer_content))
                    db.commit()
            finally:
                # Close the cursor on every path, including the early exit.
                cursor.close()
            if already_saved:
                # An identical answer is already stored: stop processing
                # the rest of this page.
                break
            print("成功获取回答! %s" % (author_name))
            time.sleep(1)
        # Pause between page requests.
        time.sleep(1)
finally:
    db.close()
在计算机编程中,匿名函数(英语:anonymous function)是指一类无需定义标识符(函数名)的函数或子程序,普遍存在于多种编程语言中。
在Python中,我们可以通过使用lambda
表达式来创建匿名函数,lambda函数的语法也非常简单:
lambda argument_list:expression
参数由逗号分隔的参数列表组成,表达式是使用这些参数的算术表达式且只能有一个,即不用写return。您可以将该函数分配给一个变量以赋予其名称。
匿名函数的好处在于它没有名字,所以你不用担心变量名会起冲突。
下面的lambda函数示例返回其两个参数的总和:
>>> sum = lambda x,y:x + y
>>> sum(3,4)
7
>>>
将上面的代码写成一般的函数表达式是这样的:
>>> def sum(x,y):
...     return x + y
...
>>> sum(3,4)
7
>>>
map()函数
map()是一个带有两个参数的函数,它接受一个函数和一个列表(或其他可迭代对象),然后将该函数依次应用于列表中的每一项。在Python 3中,map()返回的是一个迭代器,所以下面的例子都用list()把结果转换成列表。
例如我们通过map()函数计算一个列表中每一项的平方,可以直接在map()中传入一个匿名函数(lambda)
>>> list(map(lambda x: x * x, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
[1, 4, 9, 16, 25, 36, 49, 64, 81]
下面有一个更直观的例子让你看到它的优点:
>>> def fahrenheit(T):
...     return ((float(9)/5)*T + 32)
...
>>> def celsius(T):
...     return (float(5)/9)*(T-32)
...
>>> temperatures = (36.5, 37, 37.5, 38, 39)
>>> F = map(fahrenheit, temperatures)
>>> C = map(celsius, F)
>>>
>>> temperatures_in_Fahrenheit = list(map(fahrenheit, temperatures))
>>> temperatures_in_Celsius = list(map(celsius, temperatures_in_Fahrenheit))
>>> print(temperatures_in_Fahrenheit)
[97.7, 98.60000000000001, 99.5, 100.4, 102.2]
>>> print(temperatures_in_Celsius)
[36.5, 37.00000000000001, 37.5, 38.00000000000001, 39.0]
>>>
上面是一个计算华氏度和摄氏度的例子,在下面,我们使用lambda匿名函数来重写,你可以看到整个代码瞬间简洁了不少。
>>> C = [39.2, 36.5, 37.3, 38, 37.8]
>>> F = list(map(lambda x: (float(9)/5)*x + 32, C))
>>> print(F)
[102.56, 97.7, 99.14, 100.4, 100.03999999999999]
>>> C = list(map(lambda x: (float(5)/9)*(x-32), F))
>>> print(C)
[39.2, 36.5, 37.300000000000004, 38.00000000000001, 37.8]
>>>
参考
生成器(Generator)是Python语言中一个很独特的特性,生成器可以是一个函数,这个generator函数一定会包含yield语句。我们知道,在一般函数中,遇到return语句时就会终止执行并返回结果;而在generator函数中,每次运行到yield语句处就会暂停执行并产出一个值,等到下一次取值(例如for循环的下一轮迭代,或者调用next())时,再从上次yield处继续执行。
生成器表达式
跟列表生成式一样,生成器也可以用类似表达式的形式写出来,只需要把[]
改成()
就可以创建一个generator。
iterator = (i for i in range(3))
你可以使用next()
函数来获取generator的下一个值,就像这样
>>> next(iterator)
0
>>> next(iterator)
1
>>> next(iterator)
2
>>> next(iterator)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
StopIteration
当next()第一次被调用时,执行从函数体的开始处开始并继续,直到yield处返回结果,随后的调用next()从上次yield语句处继续到函数的结尾,直到没有值的时候就会抛出StopIteration错误。
但是事实上我们不可能一直调用next函数,如果要打印generator所有的值,可以使用for循环,就像这样
>>> iterator = (i for i in range(3))
>>> for item in iterator:
...     print(item)
...
0
1
2
generator函数
生成器表达式可以用来表达一些比较简单的迭代,但是遇到较复杂的条件就没办法写了,所以这个时候就可以用generator函数来表示比较复杂的逻辑关系。
比如斐波那契数列,从第三个数开始每个数等于前两个数之和,如果用函数来表示我们可以这样写
def fib(max):
    """Print the first ``max`` Fibonacci numbers (1, 1, 2, 3, ...), one per line.

    :param max: how many numbers to print
    :return: the string 'done'
    """
    n, a, b = 0, 0, 1
    while n < max:
        print(b)
        a, b = b, a + b
        n = n + 1
    return 'done'
上面是一个一般函数,如果要把它变成generator函数我们只需要把输出语句print(b)改成yield语句即可,就像这样
def gen_fib(max):
    """Yield the first ``max`` Fibonacci numbers (1, 1, 2, 3, ...).

    :param max: how many numbers to yield
    :return: 'done', carried as the value of the final StopIteration
    """
    n, a, b = 0, 0, 1
    while n < max:
        yield b
        a, b = b, a + b
        n = n + 1
    return 'done'
同样的,刚开始说过,一个函数包含了yield语句那它就是一个generator函数
>>> fib(10)
1
1
2
3
5
8
13
21
34
55
'done'
>>> gen_fib(10)
<generator object gen_fib at 0x0000024A02DC5B48>
>>> f = gen_fib(10)
>>> next(f)
1
>>> next(f)
1
>>> next(f)
2
>>> next(f)
3
>>> next(f)
5