假定有这么个场景:爬虫爬取的数据以JSON格式保存,每条记录都是一个形如Python字典的字符串。
# -*- coding: utf-8 -*-
"""Round-trip a list of movie dicts through a JSON-lines file, then write
one text file per movie, grouped into one folder per director.

All text is handled as unicode and encoded as UTF-8 only at the file
boundary, so the Chinese titles and genres survive the round trip.
"""
import io
import json
import os
import re

# A list holding several dicts (stand-in for records produced by a crawler).
movie_dict_list = [
    {"movie_rating": '8.6', "movie_column": '剧情/冒险/西部',
     "movie_title": "被解救的姜戈", "movie_director": 'Quentin'},
    {"movie_rating": '8.4', "movie_column": '剧情/犯罪',
     "movie_title": "无耻混蛋", "movie_director": 'Quentin'},
    {"movie_rating": '8.8', "movie_column": '剧情/喜剧/犯罪',
     "movie_title": "低俗小说", "movie_director": 'Quentin'},
    {"movie_rating": '9.1', "movie_column": '剧情/动作/科幻/犯罪/惊悚',
     "movie_title": "蝙蝠侠:黑暗骑士", "movie_director": 'Nolan'},
    {"movie_rating": '8.5', "movie_column": '剧情/悬疑/惊悚/犯罪',
     "movie_title": "记忆碎片", "movie_director": 'Nolan'},
    {"movie_rating": '8.8', "movie_column": '剧情/悬疑/惊悚',
     "movie_title": "致命魔术", "movie_director": 'Nolan'},
]

# Default locations used when the script is run directly.
JSON_PATH = 'F:/movie.json'
OUTPUT_ROOT = 'F:/test_dict_to_folder'

# Characters removed from a title before it is used as a file name.
# The original character class listed the typographic quote (U+201C) where
# the ASCII double quote -- which Windows forbids in file names -- was
# intended; both are stripped so previously generated names do not change.
_ILLEGAL_FILENAME_CHARS = re.compile(u'[?\\\\*|\u201c"<>:/]')


# Python 2 reference only, kept from the original post: recursively encode
# the unicode objects inside lists and dicts into UTF-8 byte strings.
# It is not needed below because text stays unicode until it is written.
# def byteify(input):
#     if isinstance(input, dict):
#         return {byteify(key): byteify(value) for key, value in input.iteritems()}
#     elif isinstance(input, list):
#         return [byteify(element) for element in input]
#     elif isinstance(input, unicode):
#         return input.encode('utf-8')
#     else:
#         return input


def _sanitize_filename(title):
    """Return *title* with the characters in _ILLEGAL_FILENAME_CHARS removed."""
    return _ILLEGAL_FILENAME_CHARS.sub(u'', title)


def _write_json_lines(path, movies):
    """Write each dict in *movies* to *path* as one JSON object per line.

    The resulting file looks like this (one line per movie; key order may
    differ between Python versions):

        {"movie_rating": "8.6", "movie_column": "剧情/冒险/西部", "movie_title": "被解救的姜戈", "movie_director": "Quentin"}
        {"movie_rating": "8.4", "movie_column": "剧情/犯罪", "movie_title": "无耻混蛋", "movie_director": "Quentin"}
        ...
    """
    # newline='\n' keeps the line terminator untranslated, as the original
    # binary-mode write did.
    with io.open(path, 'w', encoding='utf-8', newline='\n') as out:
        for movie in movies:
            # ensure_ascii=False stores the characters themselves instead
            # of \uXXXX escapes.
            line = json.dumps(movie, ensure_ascii=False)
            if isinstance(line, bytes):
                # Python 2 returns a byte string when the input dict holds
                # byte strings; the text-mode file needs unicode.
                line = line.decode('utf-8')
            out.write(line + u'\n')


def _read_json_lines(path):
    """Load *path* line by line (suits large files) into a list of dicts.

    Blank lines are skipped.

    Aside from the original post: had the file been written without
    ensure_ascii=False it would contain \\uXXXX escapes, and on Python 2 the
    post suggested byteify(json.loads(line)) to get byte strings back.
    """
    with io.open(path, encoding='utf-8') as src:
        return [json.loads(line) for line in src if line.strip()]


def _export_movies(movies, output_root):
    """Write '<output_root>/<director>/<title>.txt' for every movie.

    Each file holds the genre line followed by the rating. Missing parent
    directories are created. A record lacking one of the expected keys
    raises KeyError.
    """
    for movie in movies:
        folder = os.path.join(output_root, movie['movie_director'])
        # makedirs (not mkdir) so a missing output_root is created as well.
        if not os.path.isdir(folder):
            os.makedirs(folder)
        file_name = _sanitize_filename(movie['movie_title']) + u'.txt'
        with io.open(os.path.join(folder, file_name), 'w', encoding='utf-8') as out:
            out.write(movie['movie_column'] + u'\n' + movie['movie_rating'])


def main(json_path=JSON_PATH, output_root=OUTPUT_ROOT, movies=None):
    """Run the demo: dump the movies to JSON lines, reload them, export files.

    json_path   -- where the JSON-lines file is written and read back.
    output_root -- directory that receives one sub-folder per director.
    movies      -- list of movie dicts; defaults to movie_dict_list.
    """
    if movies is None:
        movies = movie_dict_list
    _write_json_lines(json_path, movies)
    loaded = _read_json_lines(json_path)
    _export_movies(loaded, output_root)
    print('Done!')


if __name__ == '__main__':
    main()
以上代码并不是一个完整的脚本,只是把Python相关的基础知识点糅杂在一起,如有不足,欢迎指正。