首页 > 技术文章 > python下载网页视频

ims- 2018-10-01 22:48 原文

以下代码需根据目标网站的页面结构(选择器、编码、网址格式)自行修改。

获取 mp4 链接

from bs4 import BeautifulSoup
import requests
import urllib
import re
import json
# Text encoding assigned to the HTTP response in GetwVideoHtml().
encodestyle = 'gbk'
# First listing page to crawl; passed to GetNPage_html() by the script below.
# NOTE(review): the '**' looks like a redacted host -- replace with a real URL.
homepage='http://www.**.html'
# Prefix prepended to every href collected by GetwVideoHtml().
htmlhead='http://www.**'  # used by GetwVideoHtml()


#GetNPage_html(homepage,n)
#HtmlList2Mp4List(sumhtml)
#Writelist2json(listname,lists)

def GetwVideoHtml(furl):
    """Collect the video-page links found on one listing page.

    Fetches ``furl``, decodes the response with the module-level
    ``encodestyle`` and, for every ``<a>`` that carries an ``href`` inside
    an element with class ``video_box``, returns ``htmlhead + href``.

    Args:
        furl: URL of the listing page to scan.

    Returns:
        list[str]: the prefixed links, in document order.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the server does not answer within the timeout).
    """
    retlist = []
    # A timeout keeps one stalled server from hanging the whole crawl.
    res = requests.get(furl, timeout=30)
    res.encoding = encodestyle
    soup = BeautifulSoup(res.text, 'html.parser')
    for Tag_contentpage in soup.select('.video_box'):   # <div class="video_box">
        # 'a[href]' skips anchors that have no href attribute; indexing
        # tag_a['href'] on those would raise KeyError and abort the page.
        for tag_a in Tag_contentpage.select('a[href]'):
            retlist.append(htmlhead + tag_a['href'])
    return retlist
def GetNPage_html(homepage,n):
    """Gather the video-page links from the first ``n`` listing pages.

    Page 1 is ``homepage`` itself; page k (k >= 2) is built by cutting the
    text after the last '.' of ``homepage`` and appending ``_k.html``.
    Each page URL is printed before it is scanned with GetwVideoHtml().

    Args:
        homepage: URL of the first listing page.
        n: number of listing pages to visit.

    Returns:
        list: concatenation of the per-page results, in page order.
    """
    collected = []
    for page_no in range(1, n + 1):
        if page_no > 1:
            page_url = homepage.rsplit('.', 1)[0] + '_' + str(page_no) + '.html'
        else:
            page_url = homepage
        print(page_url)
        collected.extend(GetwVideoHtml(page_url))
    return collected

def GetMp4SrcFromHtml(url):
    """Return the distinct .mp4 URLs that appear in the page at ``url``.

    The page is fetched with a browser-like User-Agent, decoded as GBK and
    scanned for ``http(s)://....mp4`` links (case-insensitive).

    Args:
        url: address of the page to scan.

    Returns:
        list[str]: each matching URL once, in order of first appearance.

    Raises:
        urllib.error.URLError: if the page cannot be opened.
    """
    # ``import urllib`` alone does not load the ``request`` submodule, so
    # import it explicitly instead of relying on another package doing it.
    import urllib.request

    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Installed globally as before, so later urlopen()/urlretrieve() calls
    # in the same process also send this User-Agent.
    urllib.request.install_opener(opener)
    # ``with`` closes the connection even if read() fails.
    with opener.open(url) as response:
        raw = response.read()
    # URLs are ASCII, so replacing undecodable bytes cannot damage a link
    # but stops one stray byte from aborting the whole run.
    text = raw.decode('gbk', errors='replace')
    # Non-greedy, and stops at whitespace/quotes/angle brackets, so two links
    # on one line are not merged into a single bogus match; the dot before
    # "mp4" is escaped so it only matches a literal '.'.
    pattern = re.compile(r'''https?://[^\s"'<>]+?\.mp4''', re.I)
    # dict.fromkeys removes duplicates while keeping a deterministic order.
    return list(dict.fromkeys(pattern.findall(text)))
def HtmlList2Mp4List(sumhtml):
    """Flatten the mp4 links of several pages into one list.

    Args:
        sumhtml: iterable of page URLs, each passed to GetMp4SrcFromHtml().

    Returns:
        list: every link returned for every page, in page order.
    """
    return [
        mp4_url
        for page_url in sumhtml
        for mp4_url in GetMp4SrcFromHtml(page_url)
    ]
def Writelist2json(listname,lists):
    """Dump ``lists`` as JSON into a file named after it and its length.

    The file is written to ``D:/ipynb/commfile/`` as
    ``<listname>_len_<len(lists)>.json``.

    Args:
        listname: base name used in the output file name.
        lists: JSON-serialisable sequence to store.
    """
    count = str(len(lists))
    target = ''.join(['D:/ipynb/commfile/', listname, '_len_', count, '.json'])
    with open(target, 'w') as out_file:
        json.dump(lists, out_file)
        
# Script entry: crawl the first 3 listing pages, extract the .mp4 links
# from every video page found, and save the resulting list as JSON.
sumhtml = GetNPage_html(homepage,3)
mp4list = HtmlList2Mp4List(sumhtml)
Writelist2json("mp4list",mp4list)

下载部分

from bs4 import BeautifulSoup
import requests
import urllib
import json
import threading
import datetime
import os

def mkdir(path):
    """Create the directory ``path`` (with parents) and report the outcome.

    Prints a "new folder" message when the directory was created and a
    "There is this folder!" message when ``path`` already exists.

    Args:
        path: directory path to create.
    """
    # Try first and handle "already exists" afterwards: a separate
    # os.path.exists() check can race with another thread or process that
    # creates the same folder between the check and the makedirs() call.
    try:
        os.makedirs(path)            # also creates missing parent directories
    except FileExistsError:
        print ("---  There is this folder!  ---")
    else:
        print ("---  new folder...  ---")
        print ("---  OK  ---")

def Schedule(a,b,c):
    """Progress callback that prints how much of a download is done.

    Has the ``reporthook`` signature used by ``urllib.request.urlretrieve``.

    Args:
        a: number of blocks transferred so far.
        b: size of one block in bytes.
        c: total size of the remote file; zero or negative when unknown.
    """
    if c <= 0:
        # Without a usable total, a percentage would divide by zero or come
        # out negative, so report the transferred byte count instead.
        print ('%d bytes' % (a * b))
        return
    # The last block usually overshoots the file size, so cap at 100.
    per = min(100.0 * a * b / c, 100)
    print ('%.2f%%' % per)
def createdownloadlink(name,url):
    """Download ``url`` and store it at the local path ``name``.

    Args:
        name: destination file path.
        url: address of the file to fetch.
    """
    # ``import urllib`` alone does not load the ``request`` submodule, so
    # import it explicitly instead of relying on another package doing it.
    import urllib.request

    urllib.request.urlretrieve(url, name)

class myThread (threading.Thread):
    """Thread that downloads one URL to one local file.

    Args:
        name: destination file path; stored in ``self.name``, which is also
            the thread's name attribute inherited from ``threading.Thread``.
        url: address of the file to download.
    """

    def __init__(self, name, url):
        super().__init__()  # initialise the Thread machinery first
        self.name = name
        self.url = url

    def run(self):
        """Download ``self.url`` to ``self.name``, printing start and finish."""
        # ``import urllib`` alone does not load the ``request`` submodule, so
        # import it explicitly instead of relying on another package doing it.
        import urllib.request

        print ("开始下载:" + self.name)
        urllib.request.urlretrieve(self.url, self.name)
        print ("完成下载:" + self.name)
def DownMp4file(lists):
    """Prepare one download thread per URL (the threads are not started).

    Ensures the folder ``D:/videos/<MM-DD>`` for today's date exists via
    mkdir(), then pairs each URL with the file ``<index>.mp4`` inside it,
    numbering from 0.

    Args:
        lists: iterable of URLs to download.

    Returns:
        list: myThread objects in the same order as ``lists``; the caller
        is responsible for calling start() and join() on them.
    """
    foldername = 'D:/videos/' + datetime.datetime.now().strftime('%m-%d')
    mkdir(foldername)
    return [
        myThread(foldername + '/' + str(index) + '.mp4', url)
        for index, url in enumerate(lists)
    ]
# Execution part


# Load the list of mp4 URLs from a JSON file.
# NOTE(review): the file name is hard-coded; presumably it is a list saved
# earlier by the link-collection script -- confirm the name matches.
with open('D:/ipynb/commfile/srcmp4s_len_66.json', 'r') as fr:
    srcmp4s = json.load(fr)

print(len(srcmp4s))
# Bare expression: has no effect in a script, and raises IndexError when the
# list is empty. NOTE(review): looks like a notebook leftover -- confirm.
srcmp4s[0]

# Build one (not yet started) download thread per URL.
threads= DownMp4file(srcmp4s)

# NOTE(review): the [:10] slices mean only the first 10 threads are started
# and joined; the rest of the list is never downloaded -- confirm intent.
for t in threads[:10]:# start the download threads
    t.start(); 
for t in threads[:10]:
    t.join();# wait for the started threads to finish before continuing

推荐阅读