Downloading files from the Internet with urllib2

Problem description

I am downloading files from the Internet for research. The code worked in Python 2, but when I moved to Python 3 I got the error TypeError: a bytes-like object is required, not 'str', because Python 3 treats strings and bytes differently. I changed .content to .text and the error went away, but now it no longer downloads the files, it only scrapes them. How can I force it to actually download them?
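
For context, here is a minimal sketch of the str/bytes distinction behind that error, assuming the requests library (per the tags) and one of the URLs from the log below: r.text is a decoded str, while r.content is the raw bytes that a file opened in 'wb' mode expects.

import requests

r = requests.get('http://www.aamalaysia.org/pdf/p-1_thisisaa1.pdf', timeout=10)
with open('file.pdf', 'wb') as out:  # binary mode expects bytes
    out.write(r.content)             # r.text here would raise the TypeError above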

def downloadFile(self, url):
    # Requires: import os, time, urllib.request
    fDir = self.outputDir
    local_file = None
    if not os.path.isdir(fDir):
        os.makedirs(fDir)

    try:
        f = urllib.request.urlopen(url, timeout=10)

        # Check that the response begins with the expected byte signature
        # before saving it; an explicit exception jumps to the cleanup below.
        for x in range(len(self.signature)):
            if ord(f.read(1)) != self.signature[x]:
                f.close()
                raise ValueError("signature mismatch")

        local_file = open("%s/file%08d.%s" % (fDir, self.successCount, self.extension), "wb")
        # The file is opened in binary mode, so write bytes, not str:
        # chr() returns a str in Python 3 and triggers
        # "TypeError: a bytes-like object is required, not 'str'".
        for x in range(len(self.signature)):
            local_file.write(bytes([self.signature[x]]))
        local_file.write(f.read())
        local_file.close()
        f.close()           
    except KeyboardInterrupt:
        raise
    except:
        if local_file is not None:
            local_file.close()
        for x in range(10):
            try:
                if os.path.isfile("%s/file%08d.%s" % (fDir, self.successCount, self.extension)): 
                    os.remove("%s/file%08d.%s" % (fDir, self.successCount, self.extension))
                break
            except:
                if x==9:
                    raise
                time.sleep(1)
        return
    self.successCount += 1
def search(self):
    if self.extension is None or self.extension == "":
        print("ERROR: No extension specified!")
        return         

    if len(self.signature) == 0:
        print("WARNING: No signature specified - THERE WILL BE LOT OF FALSE RESULTS :(")

    print("Starting with search")
    print("---------------------")
    print("Extension: " + self.extension)
    print("Signature: " + self.signatureText())
    print("Starting search base: " + self.searchCharsText())
    print("Output dir: " + self.outputDir)
    print("Max results per search: " + str(self.maxPerSearch))

    self.searchReal("")     
pos=r.text.find('<a href="')
while pos != -1:
    pos2_a=r.text.find('"', pos+16)
    pos2_b=r.text.find('&amp;', pos+16)
    if pos2_a == -1:
        pos2 = pos2_b
    elif pos2_b == -1:
        pos2 = pos2_a
    else:
        pos2 = min (pos2_a, pos2_b)
    if pos2 == -1:
        break;
    url = r.text[pos+16:pos2]
    if url.find('.google.') == -1 and url.startswith('http'):
        blocked = False
        if url not in self.downloaded:
            self.downloadFile(url)
            self.downloaded.append(url)
            f.write(url + "\n")

    pos_a=r.text.find('<a href="', pos+1)
    pos_b=r.text.find('a href="/url?q=', pos+1)
    if pos_a == -1:
        pos = pos_b
    elif pos_b == -1:
        pos = pos_a
    else:
        pos=min(pos_a, pos_b)
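
As a hedged alternative to scanning r.text with str.find, the standard library's html.parser can collect href attributes more robustly; LinkCollector and extract_links below are hypothetical names, not part of the original code.

from html.parser import HTMLParser

class LinkCollector(HTMLParser):
    """Collects every http(s) href found in <a> tags."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value and value.startswith('http'):
                    self.links.append(value)

def extract_links(html_text):
    collector = LinkCollector()
    collector.feed(html_text)
    return collector.links

# usage sketch: links = extract_links(r.text)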

Log

http://www.aamalaysia.org/pdf/p-1_thisisaa1.pdf
https://www.deanza.edu/articulation/documents/ge-aa-as-dac.pdf
https://aamexico.org.mx/media/Lista_de_precios_%2520vigentes.pdf
https://www.aflglobal.com/productlist/Product-Lines/Conductor-Accessories/230kV-Aluminum-Welded-Bus-Pipe-Supports/doc/230kv-aluminum-welded-bus-supports.aspx

Tags: python, python-requests, urllib

Solution


It looks like you have some extra code in there for your own purposes, but if it helps, downloading a file from the Internet can be as simple as:

import urllib.request

url = 'http://www.aamalaysia.org/pdf/p-1_thisisaa1.pdf'
out_file = 'file.pdf'
# urlopen().read() returns bytes, which can be written directly
# to a file opened in binary ('wb') mode.
data = urllib.request.urlopen(url).read()
with open(out_file, 'wb') as out:
    out.write(data)
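
If the files are large, a chunked copy avoids holding the whole response in memory; this is only a sketch using the same urllib.request API as above together with shutil.copyfileobj from the standard library.

import shutil
import urllib.request

url = 'http://www.aamalaysia.org/pdf/p-1_thisisaa1.pdf'
with urllib.request.urlopen(url, timeout=10) as response, open('file.pdf', 'wb') as out:
    shutil.copyfileobj(response, out)  # streams the body in chunks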
