Crawl a site with Scrapy and download its images
-
Define the item entity in items (sketch below)
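A minimal items.py sketch; the class name DoubanBookItem is an assumption, while the five field names are taken from the keys used by the spider and pipelines below:
import scrapy

class DoubanBookItem(scrapy.Item):  # class name assumed for illustration
    bookName = scrapy.Field()
    bookScore = scrapy.Field()
    bookDesc = scrapy.Field()
    bookScoreNumber = scrapy.Field()
    bookImageLink = scrapy.Field()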
-
Write the crawl logic in the spider, and clean extracted data that is empty or contains four-byte emoji encodings:
bookDesc = book.xpath('.//p/text()').get()
if bookDesc is None:
    item["bookDesc"] = ""
else:
    bookDesc = bookDesc.strip().replace("\n", "")
    # Keep only characters whose UTF-8 encoding is under 4 bytes, dropping emoji;
    # "".join also copes with an all-emoji string, where reduce over an empty
    # sequence would raise a TypeError
    item["bookDesc"] = "".join(filter(lambda c: len(c.encode('utf8')) < 4, bookDesc))
-
Enable the required options in settings: register the pipelines in ITEM_PIPELINES and point the image pipeline at a storage directory (sketch after this step):
IMAGES_STORE = 'images'
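A settings.py sketch registering the three pipelines defined below; the douban.pipelines module path is an assumption based on a typical project layout, and the User-Agent/robots lines are common extras this kind of crawl usually needs:
# Register the pipelines; lower numbers run earlier in the chain.
# "douban.pipelines" is an assumed module path -- match your project name.
ITEM_PIPELINES = {
    'douban.pipelines.DoubanImagePipeline': 300,
    'douban.pipelines.ToCsvPipeline': 400,
    'douban.pipelines.ToMysqlPipeline': 500,
}
# A browser User-Agent is usually needed too, since the site tends to
# reject Scrapy's default one, and robots.txt may disallow the pages.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
ROBOTSTXT_OBEY = False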
-
Write the image-saving pipeline in pipelines, including renaming the downloaded file:
import re
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class DoubanImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Request the cover image and pass the book name along for renaming
        yield scrapy.Request(item["bookImageLink"], meta={'name': item['bookName']})

    def file_path(self, request, response=None, info=None):
        name = request.meta['name']
        # name = re.sub(r'[?\\*|"<>:/]', '', name)  # strip characters that are illegal in file names
        imageName = name + '.jpg'
        return imageName

    def item_completed(self, results, item, info):
        return item
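Note: from Scrapy 2.4 onward, file_path also receives the item as a keyword argument (def file_path(self, request, response=None, info=None, *, item=None)); the three-argument form above targets older releases.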
-
Write the pipeline that saves the data to a CSV file, encoded as UTF-8:
import csv

class ToCsvPipeline(object):
    def __init__(self):
        # Open the file for appending; newline="" removes the blank rows
        # the csv module otherwise produces when writing on Windows
        self.f = open("doubandushu.csv", "a", newline="", encoding='utf8')
        # Header fields; these must match the item keys set in the spider
        self.fieldnames = ["bookName", "bookScore", "bookDesc", "bookScoreNumber", "bookImageLink"]
        # Use a DictWriter so rows can be written straight from the item dict
        self.writer = csv.DictWriter(self.f, fieldnames=self.fieldnames)
        # Write the header row once, which is why it lives in __init__
        self.writer.writeheader()

    def process_item(self, item, spider):
        # Write the values passed in from the spider
        self.writer.writerow(item)
        return item

    def close_spider(self, spider):
        # The hook must be named close_spider; Scrapy never calls close()
        self.f.close()
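If the CSV is meant to be opened in Excel, writing it with encoding='utf-8-sig' adds a BOM so Chinese text is detected correctly; plain utf8 is fine for most other consumers.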
- Write the pipeline that saves the data to a MySQL database; use charset utf8 so Chinese text is stored correctly:
import pymysql

class ToMysqlPipeline(object):
    def __init__(self):
        # Open the connection
        self.conn = pymysql.connect(
            host="39.101.142.214",
            user="root",
            password="123456",
            database="mydb1",
            charset="utf8")  # charset='utf8' is required when storing Chinese text

        # Create a cursor
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into doubandushu(book_name, book_score, book_desc, book_score_number, book_image_link)
            values(%s, %s, %s, %s, %s)
        """
        # Execute the insert
        self.cursor.execute(insert_sql, (item['bookName'], item['bookScore'], item['bookDesc'],
                                         item['bookScoreNumber'], item['bookImageLink']))
        # Commit; without it nothing is persisted to the database
        self.conn.commit()
        # Return the item so any later pipeline still receives it
        return item

    def close_spider(self, spider):
        # Close the cursor and the connection
        self.cursor.close()
        self.conn.close()
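For reference, a table definition the insert statement above could run against; the column types and sizes are assumptions inferred from the item fields:
CREATE TABLE doubandushu (
    id                INT AUTO_INCREMENT PRIMARY KEY,
    book_name         VARCHAR(255),
    book_score        VARCHAR(16),
    book_desc         VARCHAR(1024),
    book_score_number VARCHAR(64),
    book_image_link   VARCHAR(512)
) DEFAULT CHARSET = utf8;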
- In the main file, use Scrapy's built-in cmdline to write a launch entry point:
from scrapy import cmdline
if __name__ == '__main__':
cmdline.execute('scrapy crawl doubandushu'.split())