
Notes on Common Web-Scraping Operations

A record of some frequently used snippets, so I don't have to keep digging through old code.

0. Setup

Create a project

scrapy startproject adb

Create a spider

cd adb
scrapy genspider <spider_name> ***.com

Script path for the IDE run configuration (scrapy's cmdline.py)

D:\Anaconda3\Lib\site-packages\scrapy\cmdline.py

Parameters

crawl <spider_name>
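
As an alternative to pointing the IDE at cmdline.py, a small launcher script in the project root does the same thing. This is a minimal sketch; run.py and the spider name book_spider are my placeholders, not part of the project above.

# run.py - place it next to scrapy.cfg and run/debug this file from the IDE
from scrapy.cmdline import execute

# Equivalent to running "scrapy crawl book_spider" on the command line
execute(['scrapy', 'crawl', 'book_spider'])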

1. Pagination

# Pagination: grab the href of the "下一页" (next page) link and queue the next request
next_page = response.xpath('//div[@class="page"]/a[contains(text(),"下一页")]/@href').extract_first()
if next_page:
    next_url = 'https://www.2345daohang.com' + next_page
    yield scrapy.Request(next_url, callback=self.bookList, meta={'category': response.meta['category']})
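
For context, here is a sketch of where this fragment sits inside a spider. The class name, start URL, category XPath, and the body of bookList are assumptions of mine, not the original code.

import scrapy


class BookSpider(scrapy.Spider):
    # Hypothetical spider built around the pagination fragment above
    name = 'book_spider'
    start_urls = ['https://www.2345daohang.com']

    def parse(self, response):
        # Follow every category link, passing the category name along in meta
        for a in response.xpath('//div[@class="cate"]/a'):
            yield response.follow(
                a,
                callback=self.bookList,
                meta={'category': a.xpath('./text()').get()},
            )

    def bookList(self, response):
        # ... extract book items here ...

        # Keep following the "下一页" link until it disappears
        next_page = response.xpath('//div[@class="page"]/a[contains(text(),"下一页")]/@href').extract_first()
        if next_page:
            next_url = 'https://www.2345daohang.com' + next_page
            yield scrapy.Request(next_url, callback=self.bookList, meta={'category': response.meta['category']})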

2. Processing content

# string(.) flattens the node to plain text; strip non-breaking spaces and
# leftover inline JS, then keep only the non-empty lines
content = element.xpath('string(.)').replace('\xa0', '').replace('a("conten");', '').split('\n')
item['content'] = [i for i in content if i != '']
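
A quick, self-contained illustration of the same cleanup with lxml (the snippet above treats the result of xpath('string(.)') as a plain string, which is the lxml behaviour); the sample HTML is made up.

from lxml import etree

html = '''
<div class="article">
  <p>First paragraph.&#160;</p>
  <script>a("conten");</script>
  <p>Second paragraph.</p>
</div>
'''

element = etree.HTML(html).xpath('//div[@class="article"]')[0]

# string(.) concatenates all text beneath the node into one string
content = element.xpath('string(.)').replace('\xa0', '').replace('a("conten");', '').split('\n')
cleaned = [i for i in content if i != '']
print(cleaned)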

3. Bulk insert into the database

# -*- coding: utf-8 -*-

import csv
import pymysql

filename = "******\\yangsheng_info.csv"

# Read the whole CSV and drop the header row
with open(filename, 'r', encoding="utf-8") as f:
    reader = csv.reader(f)
    data = list(reader)
    data.pop(0)

db = pymysql.connect(host='182.92.226.**', user='root', password='密码', database='theOld')  # '密码' = password placeholder
cursor = db.cursor()

# One executemany call inserts every row; the three %s placeholders
# must match the CSV column order (theme, name, content)
sql = "INSERT INTO main_data_tougaolanmu(theme,name,content) VALUES(%s,%s,%s)"
cursor.executemany(sql, data)

print("Importing data into the database")

db.commit()
db.close()
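
If anything in the batch fails, executemany raises before the commit, so nothing is written. A slightly more defensive variant of the same step (error handling added by me, reusing db, cursor, sql and data from the script above):

# Defensive variant of the bulk insert: roll back on any failure
# and always close the connection afterwards
try:
    cursor.executemany(sql, data)
    db.commit()
except Exception as exc:
    db.rollback()
    print("bulk insert failed:", exc)
finally:
    cursor.close()
    db.close()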

4. Row-by-row insert into the database

# -*- coding: utf-8 -*-

import csv
import pymysql

filename = "***************\\minsu_info.csv"


# Read the whole CSV and drop the header row
with open(filename, 'r', encoding="utf-8") as f:
    reader = csv.reader(f)
    data = list(reader)
    data.pop(0)

db = pymysql.connect(host='182.92.226.**', user='root', password='密码', database='theOld')  # '密码' = password placeholder
cursor = db.cursor()

i = 0
for dd in data:
    i += 1
    # Look up the author's userId by nickname; '佚名' (anonymous) rows are credited to '官方2号'
    sql1 = "SELECT id from user_userInfo WHERE nickname = %s"
    if dd[1] == '佚名':
        dd[1] = '官方2号'
    cursor.execute(sql1, (dd[1],))   # execute the SQL
    results = cursor.fetchall()      # fetch all matching rows
    if results:
        # The user already exists: insert the article with their id
        sql2 = "INSERT INTO main_data_tougaolanmu(secondTypeId,name,userId,content,modifiedTime) VALUES(38,%s,%s,%s,NOW())"
        dad = list(dd)               # copy so dd keeps the original nickname
        dad[1] = results[0][0]       # replace the nickname with the userId
        try:
            cursor.execute(sql2, dad)
            db.commit()
        except Exception:
            print('error 1')
            db.rollback()            # roll back on failure
    else:
        # Unknown user: create it first, then look the id up again
        sql3 = "INSERT INTO user_userInfo(nickname) VALUES(%s)"
        try:
            cursor.execute(sql3, (dd[1],))
            db.commit()
        except Exception:
            print('error 2')
            db.rollback()            # roll back on failure
        cursor.execute(sql1, (dd[1],))
        result = cursor.fetchall()
        sql4 = "INSERT INTO main_data_tougaolanmu(secondTypeId,name,userId,content,modifiedTime) VALUES(38,%s,%s,%s,NOW())"
        dad = list(dd)
        dad[1] = result[0][0]
        try:
            cursor.execute(sql4, dad)
            db.commit()
        except Exception:
            print('error 3')
            db.rollback()            # roll back on failure
    print(i, results)

db.close()
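
The two INSERT branches above differ only in how the userId is obtained, so the loop body could be folded into two helpers. This is just a sketch; get_or_create_user and insert_article are names I made up, and the table/column names come from the script above.

ARTICLE_SQL = "INSERT INTO main_data_tougaolanmu(secondTypeId,name,userId,content,modifiedTime) VALUES(38,%s,%s,%s,NOW())"


def get_or_create_user(cursor, db, nickname):
    # Return the userId for a nickname, creating the user row if needed
    cursor.execute("SELECT id FROM user_userInfo WHERE nickname = %s", (nickname,))
    row = cursor.fetchone()
    if row:
        return row[0]
    cursor.execute("INSERT INTO user_userInfo(nickname) VALUES(%s)", (nickname,))
    db.commit()
    return cursor.lastrowid


def insert_article(cursor, db, row):
    # row = [name, nickname, content], matching the CSV columns used above
    nickname = '官方2号' if row[1] == '佚名' else row[1]
    user_id = get_or_create_user(cursor, db, nickname)
    try:
        cursor.execute(ARTICLE_SQL, (row[0], user_id, row[2]))
        db.commit()
    except Exception:
        db.rollback()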

5. Getting the HTML

Scrapy's raw response exposes the HTML directly through its .text attribute, but the selectors returned by response.xpath() do not (dir() still lists a text-like attribute, yet it cannot be used that way). In that case call getall() on the selector to obtain the corresponding HTML fragment, and then clean it up with regular expressions, split(), and so on.

import re

# getall() returns the matched nodes as HTML strings; take the first match
# and strip href attributes with a regex
content = response.xpath('//div[@class="ct tt zooms"]')[0].getall()[0]
item['content'] = re.sub(r'href="([^"])*[^=k]"', "", content)
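
The get()/getall() behaviour can be checked outside a spider with parsel, the selector library Scrapy uses under the hood; the HTML below is made up.

from parsel import Selector

html = '<div class="ct tt zooms"><p>one</p><p>two</p></div>'
sel = Selector(text=html)

# .get() returns the first match as an HTML string, .getall() returns every match
print(sel.xpath('//p').get())       # '<p>one</p>'
print(sel.xpath('//p').getall())    # ['<p>one</p>', '<p>two</p>']
print(sel.xpath('//div[@class="ct tt zooms"]').get())  # the whole div as HTML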

6. Creating the CSV

csv_init.py
import csv

# Write the header row once before the spider starts appending data
with open('../../book_info.csv', 'a', encoding="utf-8", newline='') as file_obj:
    writer = csv.writer(file_obj)
    row = ["category", "name", "cover", "author", "intro"]
    writer.writerow(row)

7. Pipeline

# Inside the pipeline's process_item(): append one row per item
with open('11_1900.csv', 'a', encoding="utf-8", newline='') as file_obj:
    writer = csv.writer(file_obj)
    row = [item["name"], item["othername"], item['author'], item['country'], item['time'], item['intro']]
    writer.writerow(row)
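
For completeness, a sketch of the pipeline class this fragment would live in; the class name is mine, and the ITEM_PIPELINES entry assumes the adb project name from step 0.

# pipelines.py - hypothetical wrapper around the fragment above
import csv


class CsvWriterPipeline:
    def process_item(self, item, spider):
        # Append one row per scraped item to the CSV created in step 6
        with open('11_1900.csv', 'a', encoding="utf-8", newline='') as file_obj:
            writer = csv.writer(file_obj)
            row = [item["name"], item["othername"], item['author'], item['country'], item['time'], item['intro']]
            writer.writerow(row)
        return item


# settings.py
# ITEM_PIPELINES = {'adb.pipelines.CsvWriterPipeline': 300}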