python-3.x - 用 python 爬取 Craigslist(不使用 Scrapy)
问题描述
我正在尝试使用 python 抓取 Craigslist 上的招聘信息(我没有使用 scrapy)。有人能帮忙解决下面代码中的问题吗?请不要建议使用 scrapy。
这是网址:https://chicago.craigslist.org/
我首先提取职位类别,然后提取职位列表,再提取职位详情,另外还编写了抓取下一页的代码。
import re
import requests
import csv
from html import unescape
def get_page_content(url, timeout=30):
    """Download ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded to ``str`` (``response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # requests applies no timeout by default; without one, a single stalled
    # connection would block the whole crawl indefinitely.
    response = requests.get(url, timeout=timeout)
    return response.text
def get_category_list(content):
    """Return entries 90 through 120 of the ``category_pat`` matches in ``content``.

    Each match is a ``(relative_url, category_name)`` tuple, as captured by
    the two groups of ``category_pat``.
    """
    all_matches = category_pat.findall(content)
    # NOTE(review): the 90:121 window presumably isolates the jobs section of
    # the home page markup -- confirm against the live page.
    return all_matches[90:121]
def get_next_page(content):
    """Return the absolute URL of the next results page, or ``None``.

    ``next_page_pat`` captures the link path without its leading slash, so
    the site root (with trailing slash) is prepended to the first match.
    """
    matches = next_page_pat.findall(content)
    if matches:
        return 'https://chicago.craigslist.org/' + matches[0]
    # No "next" button matched on this page.
    return None
def get_job_list(content):
    """Return every ``(job_url, job_title)`` pair ``job_list_pat`` finds in ``content``."""
    return job_list_pat.findall(content)
def get_job_details(content):
    """Return the first posting body ``desc_pat`` finds in ``content``.

    Returns an empty string when the pattern does not match.
    """
    matches = desc_pat.findall(content)
    return str(matches[0]) if matches else ''
def scrape_job_info(job_info, category_name):
    """Fetch one job posting, print its details and return them.

    Args:
        job_info: ``(job_url, job_name)`` pair as produced by ``get_job_list``.
        category_name: Name of the category the posting was listed under.

    Returns:
        A dict with the keys ``'jobname'``, ``'category'``, ``'JOBURL'`` and
        ``'Description'``. The same dict is also printed.
    """
    job_url, job_name = job_info
    # Titles come straight from the HTML, so entities such as &amp; are decoded.
    job_name = unescape(job_name)
    job_dict = {'jobname': job_name, 'category': category_name}
    job_dict['JOBURL'] = job_url
    print('scraping', job_name)
    # Download the posting page itself. The previous code passed the URL to
    # get_category_list(), which regex-matched the URL string and handed a
    # list to get_job_details(), raising TypeError.
    content = get_page_content(job_url)
    description = get_job_details(content)
    job_dict['Description'] = description
    print(job_dict)
    return job_dict
def crawl_category(category_name, category_url):
    """Scrape every job in a category, following "next page" links to the end.

    Prints each page URL and the job list found on it, and calls
    ``scrape_job_info`` for every listing.
    """
    page_url = category_url
    while True:
        print(page_url)
        page_html = get_page_content(page_url)
        listings = get_job_list(page_html)
        print(listings)
        for listing in listings:
            scrape_job_info(listing, category_name)
        following = get_next_page(page_html)
        if following is None:
            # Last page of this category reached.
            return
        page_url = following
def crawl_website():
    """Crawl each category found on the Chicago Craigslist home page."""
    base_url = 'https://chicago.craigslist.org'
    home_html = get_page_content(base_url)
    # Category links are site-relative, so they are joined onto the site root.
    for relative_url, name in get_category_list(home_html):
        crawl_category(name, base_url + relative_url)
# Compiled patterns used by the helper functions above. They are defined at
# module level (not inside the __main__ guard) so the helpers also work when
# this file is imported rather than run as a script.

# Captures (relative_url, category_name) from a home-page category link.
category_pat = re.compile(r'<li><a href=\"(\/d\/[\w\-]+\/\w+\/\w+)\".+txt\">([\w\-\+\s+\/\<]+)<sup class')
# Captures the path (without leading slash) of the "next page" button link.
next_page_pat = re.compile(
    r'<a href=\"\/(.*)\" class=\"button next\" title=\"next\s+page\">next > <\/a>\s+<span class=\"button next\" title=\"next page\">\s+next >\s+<\/span>\s+<\/span>\s+<\/div>\s+<\/div>\s+.+\s+.+')
# Captures (job_url, job_title) from a listing row.
job_list_pat = re.compile(r'<a href=\"(https://\w+\.craigslist.org/chc\/.+html)\".+hdrlnk\">([\w\s*]+)</a>')
# Captures the posting body text from a job detail page.
desc_pat = re.compile(r'<\/div>\s*<section id=\"postingbody\">.+html\"><\/div>\s*<\/div>(.+)<\/section><ul')
# Captures a .jpg image URL; not referenced by the functions in this section.
img_pat = re.compile(r'<img src=\"(.*jpg)\" title')

if __name__ == '__main__':
    # crawl_website() fetches the home page itself, so no separate request
    # is made here.
    crawl_website()
解决方案
推荐阅读
- django - 无法将从 GitHub 存储库克隆的 pip 包依赖项安装到单独的 virtualenv 中
- entity-framework - 违反主键,尝试插入外键表
- angular - 如何在最新的 Jhipster 中降级 Angular 版本?
- python - py 项目可以正常运行,但 pyinstaller 给出了与 PuLP 相关的错误
- gradle - 为 gradle java 进行干净构建时任务“:clean”执行失败
- javascript - 如何将存储在数组中的值作为 textContent 或 innerText 添加到具有相同类名的 div 元素
- python - Python搜索并替换为搜索结果
- sql-server - sql存储过程插入多行
- mongodb - 有没有办法使用 Linux docker 映像在本地测试 Azure CosmosDb?
- visual-studio - 为什么 DAX 编辑器在 Visual Studio 2017 中不可用?