首页 > 技术文章 > 用Python实现www网站数据下载(多线程实现代码)

sky-of-chuanqingchen 2013-10-29 17:47 原文

'''
Created on 2013-9-4   @author: sky

'''
#!/usr/bin/env python
from urllib import urlopen
from HTMLParser import HTMLParser
import threading
import time
import re 
'''
HTML parsing: Scraper extracts the target URL of every <a href=...>
link it sees and records it in the module-level urladdress dict,
keyed by the running global counter i.
'''
class Scraper(HTMLParser):
    '''Collect the href target of each anchor tag into the global
    urladdress dict (keys 1..i, assigned in document order).'''

    def __init__(self):
        # HTMLParser is an old-style class in Python 2: call its
        # initializer explicitly (super() is not available).
        HTMLParser.__init__(self)
        # Per-instance parse state.  The original kept in_link as a
        # shared class attribute and never initialized chunks/url,
        # which leaks state between Scraper instances.
        self.in_link = False
        self.chunks = []
        self.url = None

    def handle_starttag(self, tag, attrs):
        # On <a href=...>, remember the target and start collecting
        # the anchor text.
        attrs = dict(attrs)
        if tag == 'a' and 'href' in attrs:
            self.in_link = True
            self.chunks = []
            self.url = attrs['href']

    def handle_data(self, data):
        # Accumulate text only while inside an open <a> element.
        if self.in_link:
            self.chunks.append(data)

    def handle_endtag(self, tag):
        # On </a>, publish the collected URL under the next global
        # index and leave link-collecting mode.
        global i
        if tag == 'a':
            if self.in_link:
                i = i + 1
                urladdress[i] = self.url
            self.in_link = False
'''
调用URLOPEN模块和HTMLParser模块,实现HTML网页数据的下载
'''
def crawler():
global newurl,filename
print '%s:::::::%s' % (filename,newurl)
text=urlopen(newurl).read()
f=open(filename,'w')
f.write(text)
parser=Scraper()
parser.feed(text)
parser.close()
'''
程序主函数main,程序的入口函数,通过多线程实现并行处理
'''
i=0
newurl='http://www.sina.com.cn'
filename='/var/crawler/xinanews/'+str(i)
count=1
urladdress={}
t=threading.Thread(target=crawler,name=i)
t.setDaemon(1)
t.start() 
t.join()
print i
pat=re.compile('http://(.*)')
while 1:
for j in range(1,i):
m=pat.match(urladdress[j])
if m:
count=count+1
newurl=urladdress[j]
filename='/var/crawler/xinanews/'+str(count) 
print '%s:%s' % (newurl,filename)
k=threading.Thread(target=crawler,name=i)
k.setDaemon(1)
k.start() 
k.join()

 

推荐阅读