首页 > 技术文章 > shell抓取

code-style 2014-04-14 21:39 原文

#!/usr/bin/env bash
#
# Scrape the newest HTTP proxy list published on www.youdaili.cn and store the
# de-duplicated entries in <script dir>/config/ip_port.
#
# Steps:
#   1. Fetch the listing page and pick the article whose line carries the
#      hot.gif marker; its id is cut out of the second whitespace-separated
#      field of that line.
#   2. Fetch the article's first page and read the total page count from the
#      "共N页" pager text.
#   3. Walk every page, strip the markup in front of each proxy entry and the
#      "@HTTP#..." suffix behind it, then `sort -u` into the output file.
#
# Requires: bash (arithmetic for-loop, [[ ]]), curl, piconv, and an `awk` that
# is gawk (gensub is a gawk extension).
#
# NOTE(review): pages are converted UTF-8 -> GBK before matching, so the
# non-ASCII literals in the awk pattern below presumably only match when this
# file itself is saved as GBK -- confirm the file encoding before deploying.

set -u

dir=$(dirname -- "$0")
configDir="$dir/config"
ipport="$configDir/ip_port"

url="http://www.youdaili.cn/Daili/http/"

# Print a message to stderr and abort.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

# Download URL $1 and write it to stdout converted from UTF-8 to GBK.
# -S makes curl report transfer errors on stderr even though -s hides the
# progress meter.
fetch_page() {
  curl -sS --max-time 200 "$1" | piconv -f utf8 -t gbk
}

# Filter stdin down to the proxy entries: keep lines containing
# "@HTTP#...<br />", drop everything up to the last <p>/<span> tag and
# everything from "@HTTP#" onwards.
extract_proxies() {
  awk '$0~/.*@HTTP#.*<br \/>/{gsub(".*<p>","",$0);gsub(".*<span>","",$0);gsub("@HTTP#.*","",$0);print}'
}

# Article id of the list to scrape. Only the first hot.gif line is used: more
# than one match would put newlines into the URL built below.
# NOTE(review): assumes the first flagged article is the wanted one -- confirm.
indexs=$(fetch_page "$url" \
  | awk '!seen && $0~/http:\/\/www.youdaili.cn\/static\/images\/hot.gif/{print substr($2,41,length($2)-46); seen=1}')
[[ -n "$indexs" ]] || die "could not find the current proxy article on ${url}"

# The first page is needed both for the page count and for its own entries,
# so download it once and reuse it.
first_page=$(fetch_page "${url}${indexs}.html")

pages=$(printf '%s\n' "$first_page" \
  | awk '!seen && $0~/共.*页/{page=gensub(/.*共([^页]+).*/,"\\1","1",$0);print page; seen=1}')
pages=${pages//[[:space:]]/}
[[ "$pages" =~ ^[0-9]+$ ]] \
  || die "could not determine the page count of ${url}${indexs}.html (got '${pages}')"
# Force base 10 so a count such as "08" is not rejected as invalid octal.
pages=$((10#$pages))

mkdir -p -- "$configDir" || die "cannot create directory ${configDir}"

# Page 1 lives at <id>.html, page N (N > 1) at <id>_N.html. A page that fails
# to download simply contributes no entries (best effort, as before).
for ((page = 1; page <= pages; page++)); do
  if ((page == 1)); then
    printf '%s\n' "$first_page"
  else
    fetch_page "${url}${indexs}_${page}.html"
  fi | extract_proxies
done | sort -u > "$ipport"

 

推荐阅读