python-3.x - 循环爬取页面元素并在 Python 中将其保存为 json 格式
问题描述
给定如下链接(见下方代码中的 url):
我想遍历所有区县,再遍历每个区县下的所有商圈,并将它们以 json 格式保存到 txt 文件中,如下所示:
{"\u5317\u8521": "/ershoufang/beicai/", "\u78a7\u4e91": "/ershoufang/biyun/", "\u66f9\u8def": "/ershoufang/caolu/", "\u5ddd\u6c99": "/ershoufang/chuansha/", "\u5927\u56e2\u9547": "/ershoufang/datuanzhen/", ...}
我该怎么做?提前感谢。
代码:
from bs4 import BeautifulSoup
import requests
import os
from urllib.parse import urlparse
url = 'https://wh.lianjia.com/ershoufang/jiangan/'
# Bound the wait so an unresponsive server cannot hang the script forever.
r = requests.get(url, timeout=10)
# Stop on an HTTP error status instead of parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
# Collect every text node of the document. ``string=`` is the current name
# of the filter argument that older Beautiful Soup releases called ``text=``.
text = soup.find_all(string=True)
# xpath for counties
# counties: /html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[1]/a[1]
# /html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[1]/a[2]
# /html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[1]/a[3]
# xpath for commercial districts
# /html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[2]/a[1]
# /html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[2]/a[2]
解决方案
这是你想要的吗?
import json
import requests
from bs4 import BeautifulSoup
# Scheme and host that the site-relative hrefs are joined onto.
base_url = "https://wh.lianjia.com"
# Listing page the crawl starts from.
start_url = base_url + "/ershoufang/jiangan/"
def get_page(url: str, timeout: float = 10.0) -> bytes:
    """Download *url* with an HTTP GET and return the raw response body.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The undecoded response body.

    Raises:
        requests.RequestException: On connection problems, on timeout, or
            when the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly here rather than handing an error page to the HTML parser.
    response.raise_for_status()
    return response.content
def make_soup(page: bytes) -> BeautifulSoup:
    """Parse *page* and return its ``data-role="ershoufang"`` ``<div>``.

    The markup is parsed with the built-in ``html.parser``. The value
    returned is whatever ``find`` yields for the first ``<div>`` carrying
    that attribute, so it is ``None`` when the page has no such element.
    """
    document = BeautifulSoup(page, "html.parser")
    wanted_attrs = {"data-role": "ershoufang"}
    return document.find("div", wanted_attrs)
def find_anchors(soup: BeautifulSoup, div_num: int) -> list:
    """Return every ``<a>`` element inside the ``div_num``-th ``<div>`` of *soup*.

    ``div_num`` indexes the list of all ``<div>`` descendants of *soup* in
    the order ``find_all`` reports them.
    """
    nested_divs = soup.find_all("div")
    chosen_div = nested_divs[div_num]
    return chosen_div.find_all("a")
# Absolute URLs built from the links in the first <div> of the start page;
# presumably one listing page per county (verify against the live markup).
county_anchors = find_anchors(make_soup(get_page(start_url)), div_num=0)
countries = [f"{base_url}{anchor['href']}" for anchor in county_anchors]
# Maps link text -> site-relative href, merged across all fetched pages.
districts = {}
for country in countries:
    print(f"Fetching data for {country}")
    # The second <div> of each page holds the links collected here.
    district_anchors = find_anchors(make_soup(get_page(country)), div_num=1)
    found = {}
    for anchor in district_anchors:
        found[anchor.getText()] = anchor["href"]
    # Merge only after the whole page has been processed.
    districts.update(found)
# Write the collected mapping as pretty-printed JSON with sorted keys.
# The encoding is pinned so the file does not depend on the platform's
# locale default; json.dump keeps its default ASCII escaping, so non-ASCII
# keys are still emitted as \uXXXX sequences.
with open("all_districts.json", "w", encoding="utf-8") as jf:
    json.dump(districts, jf, indent=4, sort_keys=True)
输出:
{
"CBD\u897f\u5317\u6e56": "/ershoufang/cbdxibeihu/",
"\u4e03\u91cc\u5e99": "/ershoufang/qilimiao/",
"\u4e09\u73af\u5357": "/ershoufang/sanhuannan/",
"\u4e09\u9633\u8def": "/ershoufang/sanyanglu/",
"\u4e1c\u6e56\u4e1c\u4ead": "/ershoufang/donghudongting/",
"\u4e1c\u897f\u6e56\u5176\u5b83": "/ershoufang/dongxihuqita/",
"\u4e2d\u5317\u8def": "/ershoufang/zhongbeilu/",
"\u4e2d\u5357\u4e01\u5b57\u6865": "/ershoufang/zhongnandingziqiao/",
"\u4e2d\u6cd5\u751f\u6001\u57ce": "/ershoufang/zhongfashengtaicheng/",
"\u4e8c\u4e03": "/ershoufang/erqi2/",
"\u5149\u8c37\u4e1c": "/ershoufang/guanggudong/",
"\u5149\u8c37\u5357": "/ershoufang/guanggunan/",
"\u5149\u8c37\u5e7f\u573a": "/ershoufang/guangguguangchang/",
"\u5173\u5c71\u5927\u9053": "/ershoufang/guanshandadao/",
"\u5173\u897f\u957f\u804c": "/ershoufang/guanxichangzhi/",
"\u524d\u5ddd": "/ershoufang/qianchuan/",
"\u524d\u8fdb\u6c5f\u6c49": "/ershoufang/qianjinjianghan/",
"\u534e\u79d1\u5927": "/ershoufang/huakeda/",
"\u5353\u5200\u6cc9": "/ershoufang/zhuodaoquan/",
"\u5357\u6e56\u6c83\u5c14\u739b": "/ershoufang/nanhuwoerma/",
"\u53e4\u7530": "/ershoufang/gutian/",
"\u53f0\u5317\u9999\u6e2f\u8def": "/ershoufang/taibeixiangganglu/",
"\u540e\u5b98\u6e56": "/ershoufang/houguanhu/",
"\u540e\u6e56": "/ershoufang/houhu/",
"\u5434\u5bb6\u5c71": "/ershoufang/wujiashan/",
"\u5510\u5bb6\u58a9": "/ershoufang/tangjiadun/",
"\u56db\u65b0": "/ershoufang/sixin/",
"\u56e2\u7ed3\u5927\u9053": "/ershoufang/tuanjiedadao/",
"\u5824\u89d2": "/ershoufang/dijiao/",
"\u5854\u5b50\u6e56": "/ershoufang/tazihu/",
"\u5927\u667a\u8def": "/ershoufang/dazhilu/",
"\u5b97\u5173": "/ershoufang/zongguan/",
"\u5b9d\u4e30\u5d07\u4ec1": "/ershoufang/baofengchongren/",
"\u5c06\u519b\u8def": "/ershoufang/jiangjunlu/",
"\u5e38\u9752\u82b1\u56ed": "/ershoufang/changqinghuayuan/",
"\u5e38\u9752\u8def": "/ershoufang/changqinglu/",
"\u5e99\u5c71": "/ershoufang/miaoshan/",
"\u5f90\u4e1c": "/ershoufang/xudong/",
"\u6587\u5316\u5927\u9053": "/ershoufang/wenhuadadao/",
"\u65b0\u534e\u8def\u4e07\u8fbe": "/ershoufang/xinhualuwanda/",
"\u65b0\u5357\u6e56": "/ershoufang/xinnanhu/",
"\u65b0\u6d32\u5176\u5b83": "/ershoufang/xinzhouqita/",
"\u6768\u56ed": "/ershoufang/yangyuan/",
"\u6768\u6c4a\u6e56": "/ershoufang/yangchahu/",
"\u695a\u6cb3\u6c49\u8857": "/ershoufang/chuhehanjie/",
"\u6b66\u5e7f\u4e07\u677e\u56ed": "/ershoufang/wuguangwansongyuan/",
"\u6b66\u660c\u706b\u8f66\u7ad9": "/ershoufang/wuchanghuochezhan/",
"\u6b66\u6e56": "/ershoufang/wuhu/",
"\u6c11\u65cf\u5927\u9053": "/ershoufang/minzudadao/",
"\u6c34\u679c\u6e56": "/ershoufang/shuiguohu/",
"\u6c49\u5357\u5176\u5b83": "/ershoufang/hannanqita/",
"\u6c49\u53e3\u5317": "/ershoufang/hankoubei/",
"\u6c49\u6b63\u8857": "/ershoufang/hanzhengjie/",
"\u6c5f\u590f\u5176\u5b83": "/ershoufang/jiangxiaqita/",
"\u6c8c\u53e3": "/ershoufang/dunkou/",
"\u6c99\u6e56": "/ershoufang/shahu/",
"\u6d2a\u5c71\u5176\u5b83": "/ershoufang/hongshanqita/",
"\u738b\u5bb6\u6e7e": "/ershoufang/wangjiawan/",
"\u73de\u72ee\u5357\u8def": "/ershoufang/luoshinanlu/",
"\u767d\u6c99\u6d32": "/ershoufang/baishazhou/",
"\u767e\u6b65\u4ead": "/ershoufang/baibuting/",
"\u76d8\u9f99\u57ce": "/ershoufang/panlongcheng/",
"\u79ef\u7389\u6865": "/ershoufang/jiyuqiao/",
"\u7eb8\u574a": "/ershoufang/zhifang/",
"\u8001\u5357\u6e56": "/ershoufang/laonanhu/",
"\u80b2\u624d\u82b1\u6865": "/ershoufang/yucaihuaqiao/",
"\u8521\u7538\u5176\u5b83": "/ershoufang/caidianqita/",
"\u8521\u7538\u57ce\u533a": "/ershoufang/caidianchengqu/",
"\u85cf\u9f99\u5c9b": "/ershoufang/canglongdao/",
"\u864e\u6cc9\u6768\u5bb6\u6e7e": "/ershoufang/huquanyangjiawan/",
"\u8857\u9053\u53e3": "/ershoufang/jiedaokou/",
"\u91d1\u878d\u6e2f": "/ershoufang/jinronggang/",
"\u91d1\u94f6\u6e56": "/ershoufang/jinyinhu/",
"\u949f\u5bb6\u6751": "/ershoufang/zhongjiacun/",
"\u957f\u4e30\u5e38\u7801\u5934": "/ershoufang/changfengchangmatou/",
"\u957f\u6e2f\u8def": "/ershoufang/changganglu/",
"\u9633\u903b": "/ershoufang/yangluo/",
"\u96c6\u8d24": "/ershoufang/jixian2/",
"\u9752\u5c71": "/ershoufang/qingshan1/",
"\u9996\u4e49": "/ershoufang/shouyi/",
"\u9ec4\u57d4\u6c38\u6e05": "/ershoufang/huangpuyongqing/",
"\u9ec4\u9642\u5176\u5b83": "/ershoufang/huangbeiqita/"
}
推荐阅读
- python - 校正时间序列中的时钟漂移
- r - ggplot2 中 facet_wrap() 的手动中断
- kubernetes - 是否有一个 GUI 可以像“kubectl cp”一样上传文件?
- vb.net - 我如何制作组合框以列出所有带有我在 vb 中键入的文本的值
- sftp - 有没有办法通过 SFTP 与远程文件夹同步?
- javascript - 如何在没有加密密码的情况下将“auth”与 Adonis 一起使用?
- java - 无法替换批处理文件中的参数值 - PostBuildScript 插件 Jenkins
- javascript - 如果我在我的反应项目中使用故事书,我可以从我的 js 包中删除它吗?
- sql - 如何在 Postgres 中更新一对多关系?
- websphere - Websphere 管理控制台重启