首页 > 技术文章 > Python 爬取页面内容

isungge 2019-09-27 15:10 原文

import urllib.request
import requests
from bs4 import BeautifulSoup

# Fetch the page at `url` with a browser-like User-Agent, parse it with
# BeautifulSoup, and print every <a> element: first the raw list of tags,
# then the href and visible text of each link.
url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/12/1201.html"
# A (name, value) pair, the shape OpenerDirector.addheaders expects.
headers = ("User-Agent","Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# Close the response deterministically, and bound the wait so an
# unresponsive server cannot hang the script forever.
with opener.open(url, timeout=30) as response:
    data = response.read()
# GB18030 is a superset of GB2312, so valid GB2312 bytes decode the same way,
# while characters outside the narrow GB2312 set no longer raise
# UnicodeDecodeError. Undecodable bytes are replaced instead of aborting.
content = data.decode('gb18030', errors='replace')
soup = BeautifulSoup(content, 'html.parser')
# Collect the anchors once and reuse them for both outputs.
links = soup.find_all('a')
print(links)

for link in links:
    # Tag.get returns None for an anchor without an href attribute, where
    # attrs['href'] would raise KeyError and stop the loop.
    print('url:', link.get('href'))
    # The first positional argument of get_text is the separator placed
    # between text nodes; passing 'title' there injected the literal word
    # "title" into multi-node link text, so it is dropped.
    print('text:', link.get_text())

 

推荐阅读