
tiancx 2021-10-06 12:45

Python Web Scraping

Scraping steps:

Step 1. Download the page HTML: requests.get / requests.post

Step 2. Parse the HTML from step 1: lxml.etree.HTML()

Step 3. Save the parsed data from step 2

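A minimal sketch of the three steps, using Baidu's homepage as the target and saving to a text file (database storage is covered next):

import requests
from lxml import etree

# step 1: download the page
r = requests.get("http://www.baidu.com")
r.encoding = r.apparent_encoding

# step 2: parse the HTML and pull out the page title
data = etree.HTML(r.text)
title = data.xpath("//title/text()")[0]

# step 3: save the parsed data
with open("result.txt", "w", encoding="utf-8") as f:
    f.write(title)
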
Saving to a database:

Steps for connecting to MySQL with pymysql:

1. Get a connection object
2. Get a cursor object from the connection
3. Execute the SQL string with the cursor
4. Commit the transaction and close the resources (connection, cursor), or roll back on error

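A minimal sketch of the four steps, assuming a local MySQL server; the database name, table and credentials below are placeholders:

import pymysql

# 1. get a connection object (placeholder credentials)
conn = pymysql.connect(host="localhost", user="root", password="123456", db="test")
# 2. get a cursor object from the connection
cur = conn.cursor()
try:
    # 3. execute a parameterized SQL statement (placeholder table/columns)
    cur.execute("INSERT INTO news(title, url) VALUES (%s, %s)",
                ("some title", "http://example.com"))
    # 4. commit the transaction
    conn.commit()
except Exception:
    conn.rollback()
finally:
    # close the resources
    cur.close()
    conn.close()
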
Environment

In a cmd window:

python --version

pip --version
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
pip install pymysql
pip list | find "requests"

python -m pip install pip -U   (upgrade pip itself)

VS Code

Install the Python, Kite, and Jupyter extensions.

First program

import requests

# request Baidu; r is the response object
try:
    r = requests.get("http://www.baidu.com")

    # r.request.headers  -- headers of the request that was sent

    r.encoding = 'utf-8'
    r.raise_for_status()   # raise an exception for 4xx/5xx responses
    r.history              # redirect history
    r.content              # the raw binary content
except Exception:
    print("error")
print(r.text)

# save to a txt file
with open("1.txt", "w", encoding="utf-8") as f:
    f.write(r.text)
# Jupyter cell actions: Run Cell | Run Above | Debug Cell
# "with" closes the file automatically

Chinese character encoding lookup; Chinese character sets: GB2312, BIG5, GBK, GB18030, Unicode (qqxiuzi.cn)

HTTP status codes (worth memorizing)
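A quick check in requests (common codes: 200 OK, 301/302 redirect, 403 forbidden, 404 not found, 500 server error):

import requests

r = requests.get("http://www.baidu.com")
print(r.status_code)   # 200 means success; 3xx redirect, 4xx client error, 5xx server error
r.raise_for_status()   # raises requests.HTTPError for 4xx/5xx responses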

Package import path

Third-party packages are installed under Lib\site-packages in the Python installation directory.
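To confirm the location on your own machine (output depends on your installation):

import site
print(site.getsitepackages())   # directories where pip installs packages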

* parameters

A single * collects positional arguments as a tuple.

A double ** collects keyword arguments as a dict.
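For example (this is also how pymysql.Connect(**db_config) is called later in these notes):

def demo(*args, **kwargs):
    print(args)    # positional arguments collected into a tuple
    print(kwargs)  # keyword arguments collected into a dict

demo(1, 2, name="python")   # prints (1, 2) then {'name': 'python'}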

XHR

XMLHttpRequest: asynchronous requests sent by page JavaScript; they show up under the XHR tab in the browser's developer tools.

Logging in to the campus network

payload = {"username":"11",
            "password":"xN94pkdfNwM=",
            "authCode":"",
            "It":"abcd1234",
            "execution":"e3s2",
            "_eventId":"submit",
            "isQrSubmit":"false",
            "qrValue":"",
            "isMobileLogin":"false"}
url="http://a.cqie.edu.cn/cas/login?service=http://i.cqie.edu.cn/portal_main/toPortalPage"
r = requests.post(url,data=payload,timeout=3)
r.status_code

Experiment 2

1

import requests
from lxml import etree

def down_html(url):  # download the page content
    try:
        r = requests.get(url)             # send the request
        r.raise_for_status()              # raise an exception on a non-2xx response
        r.encoding = r.apparent_encoding  # set the character encoding of the response
        return r.text
    except Exception:
        print("download page error")

# parse the page
def parse_html(html):
    data=etree.HTML(html)
    title=data.xpath('//div[@id="u1"]/a/text()')
    url=data.xpath('//div[@id="u1"]/a/@href')
    result=dict()
    for i in range(0,len(title)):
        result[title[i]]=url[i]
    return result

if __name__=='__main__':
    url="http://www.baidu.com"
    for k,v in parse_html(down_html(url)).items():
        print(k+"-->"+v)

2

import requests
import csv
from lxml import etree

# download the page
def download_page(url):
    try:
        r = requests.get(url)             # send the request
        r.raise_for_status()              # raise an exception on a non-2xx response
        r.encoding = r.apparent_encoding  # set the character encoding of the response
        return r.text                     # return the page text
    except Exception:
        pass

# parse the page
def parse_html(html):
    data = etree.HTML(html)
    books = []
    for book in data.xpath('//*[@id="tag-book"]/div/ul/li'):
        name = book.xpath("div[2]/h4/a/text()")[0].strip()        # book title
        author = book.xpath('div[2]/div/span/text()')[0].strip()  # author
        price = book.xpath("div[2]/span/span/text()")[0].strip()  # price
        details_url = book.xpath("div[2]/h4/a/@href")[0].strip()  # details page URL
        books.append([name, author, price, details_url])
    return books

# save the data
def save_data(file, data):  # file: path to save to; data: list of rows
    with open(file, "w+", newline="", encoding="utf-8") as f:  # newline="" avoids blank rows on Windows
        writer = csv.writer(f)
        for row in data:
            writer.writerow(row)

    print("data saved successfully")

if __name__ == '__main__':
    url="https://www.ryjiaoyu.com/tag/details/7"
    save_data("book4.csv",parse_html(download_page(url)))

Static page scraping

The robots protocol (robots.txt)
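robots.txt tells crawlers which paths they may fetch. A small sketch using the standard library's robotparser (the Douban URL is just the example crawled later in these notes):

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("https://book.douban.com/robots.txt")
rp.read()   # download and parse robots.txt
print(rp.can_fetch("*", "https://book.douban.com/latest"))   # may the default user agent fetch this URL?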

File storage

TXT text storage

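A minimal sketch of txt storage: write the scraped strings line by line; "with" closes the file automatically.

lines = ["first item", "second item"]
with open("data.txt", "w", encoding="utf-8") as f:
    for line in lines:
        f.write(line + "\n")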

CSV file storage

import requests
from lxml import etree

# download the page
def download_html(url):
    try:
        r= requests.get(url)
        r.raise_for_status()
        r.encoding=r.apparent_encoding
        return r.text
    except Exception  as e:
        print(e)

# parse the page
def parse_html(html):
    data=etree.HTML(html)
    titles=data.xpath("//*[@id=\"colR\"]/div[2]/dl/dd/ul/li/a/text()")
    dates=data.xpath("//*[@id=\"colR\"]/div[2]/dl/dd/ul/li/span/text()")
    print(list(zip(titles,dates)))
    return list(zip(titles,dates))
# save the data
def save_data(data):
    with open("news1.txt", "w", encoding="utf-8") as f:
        for item in data:
            f.write(",".join(item) + "\n")
    print("news titles saved successfully")


if __name__ == '__main__':
    url = "http://www.cqie.edu.cn/html/2/xydt/"
    save_data(parse_html(download_html(url)))

PyMySQL

Four steps

  • Create a database connection object con
  • Get a cursor object cursor
  • Execute the SQL statement
  • Commit the transaction and close the connection


Saving scraped content to the database

Experiment

import requests
from lxml import etree
import pymysql
import xlwt

# function that downloads the page content
def download(url):
    # pretend to be a browser
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'}
    # fetch the page content with a GET request
    response = requests.get(url=url, headers=headers)
    if response.status_code == 200:
        # decode the downloaded content as "utf-8"
        response.encoding = 'utf-8'
        # return the response
        return response

# parse the page
def analysisHtml(html_text):
    # parse with XPath
    html = etree.HTML(html_text)

    # get the book list items
    all_li = html.xpath("//div[@id='content']//li")
    result = []
    for li in all_li[0:20]:
        bookname = li.xpath(".//div[2]/h2/a/text()")[0].strip()       # book title
        href = li.xpath(".//h2/a/@href")[0].strip()                   # link to the details page
        score = li.xpath(".//div[2]/p[2]/span[2]/text()")[0].strip()  # rating
        bookstr = li.xpath(".//div[2]/p[1]/text()")[0].strip()        # info line containing author, date, publisher
        author = bookstr.split("/")[0]   # author
        press = bookstr.split("/")[2]    # publisher
        pubdate = bookstr.split("/")[1]  # publication date
        # fetch the details page and keep the first 50 characters of the description
        describ = parse_desPage(download(href).text)[0:50]
        result.append([bookname, score, author, press, pubdate, describ])
    return result

# get the description text from the details page
def parse_desPage(html):
    html = etree.HTML(html)
    parts = []
    ps = html.xpath('//*[@id="link-report"]/span[1]/div/p')
    for p in ps[0:-1]:
        parts = parts + p.xpath('.//text()')
    return ','.join(parts)

# configuration for the database connection
def get_config(host, user, password, db):
    db_config = {
        'host': host,          # IP of the machine running the MySQL server
        'user': user,          # user name for logging in to MySQL
        'password': password,  # password for that user
        'db': db               # name of the MySQL database to use
    }
    return db_config

def getConn(contents):
    # connect to the database
    db_config = get_config('localhost', 'root', 'tcx119', 'python3')
    conn = pymysql.Connect(**db_config)
    cur = conn.cursor()
    sql = "insert into book(bookname,score,author,press,pubdate,describ) values(%s,%s,%s,%s,%s,%s)"
    for item in contents:
        cur.execute(sql, tuple(item))
    # (an earlier version concatenated values straight into the SQL string;
    #  the parameterized query above is safer)
    conn.commit()
    # close the database resources
    cur.close()
    conn.close()

# def saveTxt(contents):
#     # save the parsed data to a txt file
#     with open('content.txt', 'w', encoding='utf-8') as f:
#         for content in contents:
#             href = content[0]
#             title = content[1]
#             f.writelines(title + " :" + href + '\n')

if __name__ == "__main__":       
    html_text=download('https://book.douban.com/latest?icn=index-latestbook-all').text
    # contents=analysisHtml(html_text)
    getConn(analysisHtml(html_text))
    # saveTxt(analysisHtml(html_text))
    #saveExcel(contents)
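The insert above assumes a book table already exists in the python3 database. A hypothetical schema sketch (column types and sizes are guesses, adjust to your data):

import pymysql

# create the `book` table used by getConn(); the VARCHAR sizes are assumptions
conn = pymysql.Connect(host='localhost', user='root', password='tcx119', db='python3')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS book (
        id INT AUTO_INCREMENT PRIMARY KEY,
        bookname VARCHAR(255),
        score VARCHAR(32),
        author VARCHAR(255),
        press VARCHAR(255),
        pubdate VARCHAR(64),
        describ VARCHAR(255)
    ) DEFAULT CHARSET=utf8mb4
""")
conn.commit()
cur.close()
conn.close()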

BS4

code

# %%
import requests
from bs4 import BeautifulSoup

r=requests.get("http://www.baidu.com")
r.encoding = r.apparent_encoding

soup=BeautifulSoup(r.text,"lxml")
# %%
soup
# %%
soup.a
# %%
soup("a")
# %%
soup.find_all("a")
# %%
soup.a["href11"]      # missing attribute: [] raises KeyError

# %%
soup.a.get("href11")  # missing attribute: .get() returns None
# %%
soup("a")[-1].get("class")
# %%
soup("a")[-1].string
# %%
soup("a")[-1].get_text()
# %%
# nested text needs get_text() to retrieve it
soup.div.get_text()
# %%
soup.select("*")
# %%
soup.select("#cp")
# %%
soup.select("#cp,input")
# %%
# when an element has multiple classes, keep chaining them with "."
soup.select("#cp,input.bg.s_btn")
# %%
soup.select_one("input")

# %%
soup.a.attrs["href"]
# %%

Scraping the announcements on http://ccgp-shaanxi.gov.cn/


#http://ccgp-shaanxi.gov.cn/

import requests
import csv
from bs4 import BeautifulSoup
r=requests.get("http://ccgp-shaanxi.gov.cn/")
r.encoding = r.apparent_encoding
soup=BeautifulSoup(r.text,"lxml")
soup
# %%
# soup.select("div.list-box")
# %%
rows=soup.select("#jdglprovincenotice > tr")
data = [[row.td.a.string,row.findAll("td")[-1].string] for row in rows]
# %%
print(data)
# %%
# save data to a csv file
with open("bs1.csv", "w", newline="", encoding="utf-8") as file:  # newline="" avoids blank rows on Windows
    writer = csv.writer(file)
    writer.writerows(data)
print("data has been saved completely")
# %%
