Assignment 6

kkk-home 2020-12-01 22:45

Assignment 1:

Crawl the Douban Top 250 movie data with the requests and BeautifulSoup libraries.

The code is as follows:


from bs4 import BeautifulSoup
import threading
import re
import requests
import urllib.request
import pymysql


def get_html(url):
    res = requests.get(url, headers=headers)
    res.encoding = res.apparent_encoding
    html = res.text
    parse(html)


def parse(html):
    urls = []
    soup = BeautifulSoup(html, "html.parser")
    movies = soup.find('ol')
    movies = movies.find_all('li')
    for i in movies:
        try:
            # rank number: take the <em> tag's content as a string
            mNo = i.em.string
            # movie title
            mName = i.find('span').text
            '''# use a regex to pull out the poster image URL
            img = str(i.find('img'))
            # re.findall returns a list
            img_url = re.findall(r'src="(.*?)"', img)
            img_url = img_url[0]
            # download and save the image; this version is single-threaded
            download(mName, img_url)'''
            # director, cast, release year, country and genre all sit in the same <p> node
            info = i.find('p').text
            # director
            director = re.findall(r'导演: (.*?) ', info)
            # lead actors
            actor = re.findall(r'主演: (.*?) ', info)
            # split the second line of the info block into a list
            array = re.findall(r'\d+.+', info)[0].split('/')
            # strip() removes the surrounding whitespace
            # release year
            time = array[0].strip()
            # country
            country = array[1].strip()
            # genre
            type = array[2].strip()
            # rating
            score = i.find('span', attrs={"class": "rating_num"}).text
            # walk from the rating span through its siblings via next_sibling to reach the review count
            num = i.find('span', attrs={"class": "rating_num"}).next_sibling.next_sibling.next_sibling.next_sibling.text
            # quote
            quote = i.find('span', attrs={"class": "inq"}).text
            # image file name
            mFile = str(mName) + ".jpg"
            cursor.execute("insert into movie(mNo,mName,director,actor,time,country,type,score,num,quote,mFile) "
                           "values( % s, % s, % s, % s, % s, % s, % s, % s, % s, % s, % s)",
                           (mNo, mName, director, actor, time, country, type, score, num, quote, mFile))
        except Exception:
            pass
    # find every image on the page
    images = soup.select("img")
    for image in images:
        try:
            # the poster URL
            url = image['src']
            # the corresponding movie title (alt attribute)
            mName = image['alt']
            if url not in urls:
                # remember the URL so the same poster is not downloaded twice
                urls.append(url)
                T = threading.Thread(target=download, args=(mName, url))
                T.setDaemon(False)
                T.start()
                threads.append(T)
        except Exception as err:
            print(err)


def download(pic_name, img):
    req = urllib.request.Request(img)
    data = urllib.request.urlopen(req, timeout=100)
    data = data.read()
    fobj = open("D:/data/file2/" + str(pic_name) + ".jpg", "wb")
    fobj.write(data)
    fobj.close()


url = 'https://movie.douban.com/top250'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'}
# connect to MySQL
con = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='123456', db='mydb', charset='utf8')
cursor = con.cursor(pymysql.cursors.DictCursor)
sql = 'create table movie(mNo int,mName varchar(32), director varchar(32),actor varchar(32),time varchar(32),' \
      'country varchar(16),type varchar(32),score varchar(32),num varchar(32),quote varchar(32),mFile varchar(32));'
cursor.execute(sql)
cursor.execute("delete from movie")

threads = []
# the Top 250 list spans 10 pages of 25 movies each
for page in range(0, 10):
    get_html("https://movie.douban.com/top250?start="+str(page*25)+"&filter=")
# wait for all download threads to finish
for t in threads:
    t.join()
# commit the inserts and close the database connection
con.commit()
con.close()

Results

  • Database storage
  • Downloaded images folder

Reflections

1. A good review: I re-learned how to use re and BeautifulSoup (I had forgotten almost everything), and some of my earlier confusion now makes sense. I now have a rough grasp of BeautifulSoup's sibling-navigation methods, although at first it was not obvious why reaching the adjacent tag takes more than one next_sibling call (see the sketch after this list).
2. Images are collected by grabbing every img src on the page, so pictures unrelated to the movies may get downloaded as well. My original idea was to store all the image URLs in a list first and crawl them afterwards, but naming the files then becomes awkward; I hope to find a better approach next time.
3. My understanding of how multithreading really works is still shallow; I can use it without fully understanding why it works, so I need to read more. I had not planned to use MySQL for storage this time, but prettytable kept raising an "exceeded iteration count" error that killed the process, so I gave in.
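A minimal standalone sketch of the point in item 1, using made-up HTML rather than the real Douban page: the whitespace between two tags is itself a NavigableString sibling, so the first next_sibling lands on that text node rather than on the next tag (the real page also has an extra empty span after the rating, which is presumably why the assignment code chains four calls). find_next_sibling() skips text nodes entirely.

from bs4 import BeautifulSoup

html = """
<div>
    <span class="rating_num">9.7</span>
    <span>2344506人评价</span>
</div>
"""
soup = BeautifulSoup(html, "html.parser")
rating = soup.find('span', attrs={"class": "rating_num"})

# the immediate sibling is the whitespace text node between the two tags
print(repr(rating.next_sibling))                # '\n    '
# so reaching the neighbouring tag takes two next_sibling steps
print(rating.next_sibling.next_sibling.text)    # 2344506人评价
# find_next_sibling() jumps straight to the next tag
print(rating.find_next_sibling('span').text)    # 2344506人评价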

Assignment 2:

Crawl the university ranking data from shanghairanking.cn using the Scrapy + XPath + MySQL storage route.

The code is as follows:

  • college_bank

import scrapy
from bs4 import UnicodeDammit
import requests
import urllib.request
from ..items import CollegeInfoItem


class CollegeBankSpider(scrapy.Spider):
    name = 'college_bank'
    # allowed_domains = ['www.baidu.com.io_']

    def start_requests(self):
        url = 'https://www.shanghairanking.cn/rankings/bcur/2020'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # decode the response body with UnicodeDammit
        dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        selector = scrapy.Selector(text=data)
        lis = selector.xpath('//table[@class="rk-table"]//tbody[@data-v-2a8fd7e4]//tr')
        for li in lis:
            # sNo, sName and city can all be read from the ranking page itself
            sNo = li.xpath('.//td[position()=1]/text()').extract_first().replace('\n', '').replace(' ', '')
            sName = li.xpath('.//td[position()=2]/a/text()').extract_first()
            city = li.xpath('.//td[position()=3]/text()').extract_first().replace('\n', '').replace(' ', '')
            # the href found on this page is not the university's official website;
            # officalUrl, info and logo have to be fetched from the detail page
            url = li.xpath('.//td[@class="align-left"]/a/@href').extract_first()
            url = 'http://www.shanghairanking.cn' + url
            res = requests.get(url)
            res.encoding = res.apparent_encoding
            html = res.text
            selector = scrapy.Selector(text=html)
            # note that the official URL and the logo src are attribute values
            officalUrl = selector.xpath('//div[@class="univ-website"]/a/@href').extract_first()
            info = selector.xpath('//div[@class="univ-introduce"]/p/text()').extract_first()
            # get the school logo URL and download the image
            logo = selector.xpath('//td[@class="univ-logo"]/img/@src').extract_first()
            req = urllib.request.Request(logo)
            data = urllib.request.urlopen(req, timeout=100)
            data = data.read()
            fobj = open("D:/data/file1/" + str(sNo) + ".png", "wb")
            fobj.write(data)
            fobj.close()
            print('#######################################')
            print(sNo)
            print(sName)
            print(city)
            print(officalUrl)
            print(info)
            print(str(sNo)+".png")
            print('**************************************')
            item = CollegeInfoItem()
            item['sNo'] = sNo
            item['sName'] = sName
            item['city'] = city
            item['officalUrl'] = officalUrl
            item['info'] = info
            item['mFile'] = str(sNo)+".png"
            yield item

  • items

import scrapy


class CollegeInfoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    sNo = scrapy.Field()
    sName = scrapy.Field()
    city = scrapy.Field()
    officalUrl = scrapy.Field()
    info = scrapy.Field()
    mFile = scrapy.Field()
    pass
  • pipelines

import pymysql


class CollegeInfoPipeline:
    def open_spider(self, spider):
        try:
            self.con = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='123456',
                                       db='mydb', charset='utf8')
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.sql = 'create table college(sNo int,sName varchar(32),city varchar(16),officalUrl varchar(64),' \
                       'info text,mFile varchar(16));'
            # self.cursor.execute(self.sql)
            self.cursor.execute('delete from college')
            self.opened = True
        except Exception as err:
            print(err)

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False

    def process_item(self, item, spider):
        if self.opened:
            self.cursor.execute("insert into college (sNo,sName,city,officalUrl,info,mfile) values(%s,%s,%s,%s,%s,%s)",
                                (item['sNo'], item['sName'], item['city'], item['officalUrl'], item['info'], item['mFile']))
        return item

Results

  • Console output

  • Database storage

  • Downloaded images folder

Reflections

Revisiting the Scrapy framework refreshed how it is used and deepened my understanding of Selector (a small standalone refresher sketch follows). Everything this assignment needs can be found on a single page, so no pagination is required, and building on the earlier material it did not feel too difficult. Coming back to Scrapy after many days, the comparison makes me appreciate Selenium: it may not be as fast, but it really is convenient (although a large part of that is probably just that I had not used Scrapy for so long that I had forgotten it).
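As that refresher, here is a minimal sketch (the HTML is made up, not the real ranking page) showing that scrapy.Selector can be used on its own to test XPath expressions before they go into parse():

from scrapy import Selector

html = """
<table class="rk-table">
  <tbody>
    <tr>
      <td>1</td>
      <td class="align-left"><a href="/institution/tsinghua-university">清华大学</a></td>
      <td>北京</td>
    </tr>
  </tbody>
</table>
"""
sel = Selector(text=html)
for row in sel.xpath('//table[@class="rk-table"]//tbody/tr'):
    rank = row.xpath('.//td[position()=1]/text()').extract_first()
    name = row.xpath('.//td[position()=2]/a/text()').extract_first()
    href = row.xpath('.//td[@class="align-left"]/a/@href').extract_first()
    print(rank, name, href)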

Assignment 3:

Use the Selenium framework plus MySQL storage to simulate logging in to the MOOC site (icourse163.org), fetch the information of the courses in my own account, and save it to MySQL.

The code is as follows:


from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import pymysql


def Load_in():
    time.sleep(1)
    # click the login entry
    user = driver.find_element_by_xpath('//*[@id="j-topnav"]/div')
    user.click()
    time.sleep(1)
    # choose another sign-in method
    way = driver.find_element_by_xpath('//div[@class="ux-login-set-scan-code_ft"]/span')
    way.click()
    time.sleep(1)
    # switch to phone-number login
    telephone = driver.find_element_by_xpath('//ul[@class="ux-tabs-underline_hd"]/li[2]')
    telephone.click()
    time.sleep(1)
    frame = driver.find_element_by_xpath("/html/body/div[13]/div[2]/div/div/div/div/div/div[1]/div/div[1]/div[2]/div[2]/div/iframe")
    # move the driver's focus into the login pop-up iframe
    driver.switch_to.frame(frame)
    driver.find_element_by_xpath('//input[@type="tel"]').send_keys('15260637027')
    time.sleep(1)
    driver.find_element_by_xpath('//input[@class="j-inputtext dlemail"]').send_keys('******')
    time.sleep(1)
    load_in = driver.find_element_by_xpath('//*[@id="submitBtn"]')
    load_in.click()


def MyClass():
    time.sleep(2)
    # go to the personal center (my courses)
    myclass = driver.find_element_by_xpath('//*[@id="j-indexNav-bar"]/div/div/div/div/div[7]/div[3]/div')
    myclass.click()
    all_spider()


def all_spider():
    time.sleep(1)
    spider()
    time.sleep(1)
    # pagination: if the "next page" button carries the disabled class we are on the last page;
    # otherwise click the enabled button and crawl the next page recursively
    try:
        driver.find_element_by_xpath('//ul[@class="ux-pager"]/li[@class="ux-pager_btn ux-pager_btn__next"]/a[@class="th-bk-disable-gh"]')
    except Exception:
        driver.find_element_by_xpath('//ul[@class="ux-pager"]/li[@class="ux-pager_btn ux-pager_btn__next"]/a[@class="th-bk-main-gh"]').click()
        all_spider()


def spider():
    global id
    time.sleep(1)
    lis = driver.find_elements_by_xpath('//div[@class="course-card-wrapper"]')
    print(lis)
    for li in lis:
        time.sleep(1)
        li.click()
        # get the window handles: index 0 is the original page, index 1 the newly opened one
        window = driver.window_handles
        # switch to the new window
        driver.switch_to.window(window[1])
        time.sleep(1)
        driver.find_element_by_xpath('//*[@id="g-body"]/div[3]/div/div[1]/div/a').click()
        # fetch the handles again after the second window opens
        window = driver.window_handles
        # switch to the second new window (the course detail page)
        driver.switch_to.window(window[2])
        time.sleep(1)
        id += 1
        course = driver.find_element_by_xpath('//*[@id="g-body"]/div[1]/div/div[3]/div/div[1]/div[1]/span[1]').text
        teacher = driver.find_element_by_xpath('//*[@id="j-teacher"]//h3[@class="f-fc3"]').text
        collage = driver.find_element_by_xpath('//*[@id="j-teacher"]/div/a/img').get_attribute('alt')
        process = driver.find_element_by_xpath('//*[@id="course-enroll-info"]/div/div[1]/div[2]/div[1]').text
        count = driver.find_element_by_xpath('//*[@id="course-enroll-info"]/div/div[2]/div[1]/span').text
        brief = driver.find_element_by_xpath('//*[@id="j-rectxt2"]').text
        print(id)
        print(course)
        print(teacher)
        print(collage)
        print(process)
        print(count)
        print(brief)
        cursor.execute("insert into mooc(id, course, teacher, collage, process, count, brief) "
                       "values( % s, % s, % s, % s, % s, % s, % s)",
                       (id, course, teacher, collage, process, count, brief))
        time.sleep(1)
        # close the course detail window
        driver.close()
        # switch back to the previous window
        driver.switch_to.window(window[1])
        time.sleep(1)
        # two new windows were opened, so close() has to be called twice
        driver.close()
        driver.switch_to.window(window[0])
        print('***********************************')


url = 'https://www.icourse163.org/'

chrome_options = Options()
'''chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')'''
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get(url)
# maximize the browser window
driver.maximize_window()
# connect to MySQL
con = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='123456', db='mydb', charset='utf8')
cursor = con.cursor(pymysql.cursors.DictCursor)
sql = 'create table mooc(id int,course varchar(32),teacher varchar(16),collage varchar(32),process varchar(64),' \
      'count varchar(64),brief text);'
'''cursor.execute(sql)'''
cursor.execute("delete from mooc")
# log in
Load_in()
id = 0
# crawl the course information
MyClass()
# commit and close the database connection
con.commit()
con.close()

Login GIF

Results

  • Terminal output
  • Database storage

Reflections

1. When simulating the login, clicking register/log in opens a small pop-up. This pop-up is not a new page; the driver has to be switched into it with driver.switch_to.frame (a minimal sketch of this pattern follows this list).
2. This assignment opens a second new window on top of the first one, so the window handles have to be fetched again, and close() likewise has to be called twice.
ps: This assignment is much like the previous one, except that while simulating the course clicks a "please rate us" pop-up suddenly appeared and crashed the program. After rating it manually once it stopped appearing; without rating it, it keeps popping up...
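A minimal sketch of the iframe-switching pattern from item 1 (the locator here is illustrative only; the real login iframe XPath is in Load_in above). The key calls are switch_to.frame to enter the pop-up and switch_to.default_content to get back to the main document:

from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.icourse163.org/')

# everything inside the login pop-up is only reachable after switching into its iframe
frame = driver.find_element_by_tag_name('iframe')   # illustrative locator, not the real one
driver.switch_to.frame(frame)
driver.find_element_by_xpath('//input[@type="tel"]').send_keys('your phone number')

# switch back to the top-level document before touching elements outside the pop-up
driver.switch_to.default_content()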
