Batch downloading PDF files with Python

Problem description

I'm a beginner Python programmer trying to write a program that I can give the URL of a particular website; it should follow the various links on that site and download all of the PDF files it finds. I'm using Google Chrome. I searched the internet for good ways to do this and pieced together the code below, which I ran in PyCharm.

def getPageHtmlSourceCode(url):
    try:
        import urllib2

        request_headers = {
            "Accept-Language": "en-US,en;q=0.5",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Referer": "http://thewebsite.com",
            "Connection": "keep-alive"
        }

        request = urllib2.Request(url, headers=request_headers)
        return urllib2.urlopen(request).read()
    except:
        return "error"

def inputUrl():
    url = "http://www.4ono.com/cbse-12th-science-previous-year-question-papers-pdf-201617/"
    htmlSourceCode = getPageHtmlSourceCode(url);
    if htmlSourceCode != "error":
        return htmlSourceCode
    print("\nCouldn't connect to web, please check the url entered or try again later\n")

def crawlPage(htmlSourceCode):
    start = 0
    while (1):
        subjectNameStart = htmlSourceCode.find('<h2 id="', start)
        if subjectNameStart < 0:
            break
        subjectNameEnd = htmlSourceCode.find('"', subjectNameStart + 8)
        subjectName = htmlSourceCode[subjectNameStart + 8:subjectNameEnd - 1]
        # my_file = open("output.txt", "a")
        # my_file.write(subjectName + '\n')
        # my_file.close()
        # print(subjectName)
        newSubSection = htmlSourceCode.find('<h2 id="', subjectNameEnd)
        subSectionEnd = subjectNameEnd
        while (subSectionEnd < newSubSection):
            stateStart = htmlSourceCode.find('<p><strong>', subjectNameEnd)
            stateEnd = htmlSourceCode.find('</strong>', stateStart + 11)
            state = htmlSourceCode[stateStart + 11:stateEnd]
            subjectNameEnd = stateEnd
            if subSectionEnd > newSubSection:
                break
            # my_file = open("output.txt", "a")
            # my_file.write(state + '\n')
            # my_file.close()
            newStateSection = htmlSourceCode.find('<p><strong>', stateEnd + 7)
            stateSectionEnd = stateEnd
            while (1):
                pdfLinkStart = htmlSourceCode.find('<a href="', stateEnd)
                subSectionEnd = pdfLinkStart
                pdfLinkEnd = htmlSourceCode.find('.pdf">', pdfLinkStart + 9)
                pdfLink = htmlSourceCode[pdfLinkStart + 9:pdfLinkEnd + 4]
                yearEnd = htmlSourceCode.find("</a>", pdfLinkEnd + 4)
                year = htmlSourceCode[pdfLinkEnd + 6:yearEnd]
                stateEnd = yearEnd
                stateSectionEnd = yearEnd
                pdfName = subjectName + '-' + state + '-' + year + '.pdf'
                import urllib
                urllib.urlretrieve(pdfLink, pdfName)
                if stateSectionEnd > newStateSection:
                    break
            # my_file = open("output.txt", "a")
            # my_file.write(year + '\n')
            # my_file.write(pdfLink + '\n')
            # my_file.close()
            # print(pdfLink)
            # print(year)
        start = subjectNameEnd

def lookUp():
    htmlSourceCode = inputUrl()
    crawlPage(htmlSourceCode)

Running the code produces the following output:

"C:\Users\Pyro\PycharmProjects\PDF Downloader\venv\Scripts\python.exe" C:/Users/Pyro/.PyCharmCE2019.1/config/scratches/scratch.py

Process finished with exit code 0

Tags: python, python-3.x, pdf, url, download

Solution

You never actually call the lookUp() function, so the script runs to completion without doing anything. Add the following line at the end of your code:

lookUp()
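
Note that on Python 3 (the question is tagged python-3.x) adding that call still won't be enough: urllib2 only exists in Python 2, so the import inside getPageHtmlSourceCode raises ImportError, the bare except swallows it, and the function returns "error". A minimal sketch of the network helper ported to Python 3's urllib.request, assuming the rest of the parsing code stays the same:

import urllib.request

def getPageHtmlSourceCode(url):
    request_headers = {
        "Accept-Language": "en-US,en;q=0.5",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Referer": "http://thewebsite.com",
        "Connection": "keep-alive"
    }
    try:
        request = urllib.request.Request(url, headers=request_headers)
        # urlopen() returns bytes in Python 3; decode so .find() still works on a str
        return urllib.request.urlopen(request).read().decode("utf-8", errors="replace")
    except Exception:  # narrower than the original bare except, same fallback
        return "error"

Likewise, inside crawlPage() the Python 2 download call urllib.urlretrieve(pdfLink, pdfName) would become urllib.request.urlretrieve(pdfLink, pdfName). The Python documentation describes urlretrieve as a legacy interface, but it is still available and fine for a quick script like this.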
