首页 > 技术文章 > php curl 抓页面 css

xiao-chuan- 2022-04-09 23:04 原文

声明

本文仅用于技术交流与学习,禁止商用!!!禁止商用!!!禁止商用!!!
未经本人允许擅自商用的,一经发现,违者必究!!!

调用

// Usage example: collect the stylesheet URLs of a page, then download them.
$url='http://xxx.cn/';
$cssUrls=json_decode(curl_css($url),true);
if ($cssUrls==null || count($cssUrls)==0) {
    // curl_css() returned a plain error message (not JSON) or found nothing.
    echo '获取错误';
    exit;
}
// Keep only the first two entries; array_splice() also renumbers the keys from 0.
array_splice($cssUrls,2);
// Downloads are stored in a "css" directory next to this script.
$cssDir=__DIR__.DIRECTORY_SEPARATOR.'css'.DIRECTORY_SEPARATOR;

// Single file
echo curl_downcss($cssUrls[0],$cssDir,'GET','1');

// Several files, one after another
//foreach ($cssUrls as $cssUrl) {
//    echo curl_downcss($cssUrl,$cssDir,'GET','1');
//}

// Several files in one batch
//curl_downcss_multi($cssUrls,$cssDir,'GET','1');

方法

// ============ css ============

/**
 * Fetch a page and list the stylesheet URLs referenced by its <link> tags.
 *
 * Only href values that end in ".css" are collected. Root-relative,
 * protocol-relative and page-relative hrefs are turned into absolute URLs
 * based on $url; duplicates are removed.
 *
 * Note: TLS peer/host verification is disabled for this request.
 *
 * @param string $url Page URL to scan.
 * @return string JSON-encoded list of stylesheet URLs, or the plain cURL
 *                error message (not JSON) when the request fails.
 */
function curl_css($url='') {
    $ch=curl_init();
    $array=array(
        CURLOPT_URL => $url,
        CURLOPT_ENCODING => 'gzip,deflate',
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_HTTPHEADER => array(
            'pragma: no-cache',
            'cache-control: no-cache',
            'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
            'accept: application/json, text/plain, */*',
            'content-type: application/json',
            'sec-ch-ua-mobile: ?0',
            'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
            'sec-ch-ua-platform: "Windows"',
            'sec-fetch-site: same-origin',
            'sec-fetch-mode: cors',
            'sec-fetch-dest: empty',
            'accept-language: zh-CN,zh;q=0.9'
        )
    );
    curl_setopt_array($ch,$array);
    $output=curl_exec($ch);
    if (curl_errno($ch)) {
        // Read the message before releasing the handle.
        $error=curl_error($ch);
        curl_close($ch);
        return $error;
    }
    curl_close($ch);
    if (!is_string($output)) {
        $output='';
    }

    // Adjust this pattern to the target page if needed. "[^>]*?" keeps every
    // match inside a single tag, so several <link> tags on one line (minified
    // HTML) are all found instead of only the last one.
    preg_match_all('/<link\b[^>]*?\shref=(["\'])([^"\'\s>]+\.css)\1/i',$output,$arr);
    $hrefs=isset($arr[2])?$arr[2]:array();

    $parts=parse_url($url);
    if (!is_array($parts)) {
        $parts=array();
    }
    $scheme=isset($parts['scheme'])?$parts['scheme']:'http';
    $url_main='';
    if (isset($parts['host'])) {
        $url_main=$scheme.'://'.$parts['host'].(isset($parts['port'])?':'.$parts['port']:'');
    }
    // Directory part of the page path, used to resolve page-relative hrefs.
    $page_path=isset($parts['path'])?$parts['path']:'/';
    $last_slash=strrpos($page_path,'/');
    $base_dir=$last_slash===false?'/':substr($page_path,0,$last_slash+1);

    $items=array();
    foreach ($hrefs as $href) {
        $href=trim($href);
        if ($href==='') {
            continue;
        }
        if (substr($href,0,2)==='//') {
            // Protocol-relative: reuse the scheme of the page.
            $href=$scheme.':'.$href;
        } elseif (substr($href,0,1)==='/') {
            // Root-relative: prefix scheme://host[:port].
            $href=$url_main.$href;
        } elseif (!preg_match('/^[a-z][a-z0-9+.\-]*:/i',$href)) {
            // Page-relative: resolve against the directory of the page.
            $href=$url_main.$base_dir.$href;
        }
        $items[]=$href;
    }
    // Re-index so the JSON output is always a list, never an object.
    $items=array_values(array_unique($items));
    return json_encode($items,JSON_UNESCAPED_SLASHES|JSON_UNESCAPED_UNICODE);
}

/**
 * Download one file to disk with cURL.
 *
 * The target directory is created when missing. The file name is derived
 * from the path part of the URL (query string ignored); the extension falls
 * back to "css" when the URL has none. A download is kept only when cURL
 * reports no error, the HTTP status is 200 and the size on disk equals the
 * number of bytes cURL reports as downloaded; otherwise the file is removed.
 *
 * Note: TLS peer/host verification is disabled for this request.
 *
 * @param string $url    URL of the file to download.
 * @param string $dir    Target directory, including the trailing separator.
 * @param string $method HTTP method, upper-cased before use.
 * @param string $type   Naming rule: '1' = file name taken from the URL,
 *                       anything else = randomly generated name.
 * @return string 'ok' on success, otherwise an error message.
 */
function curl_downcss($url='',$dir='',$method='GET',$type='1') {
    if (!is_dir($dir)) {
        mkdir($dir,0777,true);
    }
    // Use only the URL path so "?v=1" style suffixes never reach the file name.
    $path=parse_url($url,PHP_URL_PATH);
    if (!is_string($path) || $path==='') {
        $path=$url;
    }
    $ext=pathinfo($path,PATHINFO_EXTENSION);
    if ($ext==='') {
        $ext='css';
    }
    $name=$type=='1'
        ?pathinfo($path,PATHINFO_FILENAME)
        :sha1(md5(microtime(true).mt_rand(1,100000).mt_rand(1,100000)));
    $file_path=$dir.$name.'.'.$ext;

    $fp=fopen($file_path,'w');
    if ($fp===false) {
        return '无法写入文件: '.$file_path;
    }
    $ch=curl_init();
    $arr=array(
        CURLOPT_URL => $url,
        CURLOPT_CUSTOMREQUEST => strtoupper($method),
        CURLOPT_HEADER => 0,
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_FILE => $fp,
    );
    curl_setopt_array($ch,$arr);
    curl_exec($ch);

    // Collect everything needed from the handle, then release both resources
    // on every path (success included).
    $errno=curl_errno($ch);
    $error=curl_error($ch);
    $info=curl_getinfo($ch);
    curl_close($ch);
    fclose($fp);

    // Measure the file only after it is closed, bypassing the stat cache.
    clearstatcache(true,$file_path);
    $size=filesize($file_path);

    if ($errno) {
        unlink($file_path);
        return $error;
    }
    if ($info['http_code'] != '200' || $size != $info['size_download']) {
        unlink($file_path);
        return '数据不完整';
    }
    return 'ok';
}

/**
 * Download several files in parallel with the cURL multi interface.
 *
 * The target directory is created when missing. For every URL the file name
 * is derived from the path part of the URL (query string ignored); the
 * extension falls back to "css" when the URL has none. A download is kept
 * only when cURL reports no error, the HTTP status is 200 and the size on
 * disk equals the number of bytes cURL reports as downloaded; otherwise the
 * file is removed. URLs whose target file cannot be opened are skipped.
 * Prints "ok: N" where N is the number of files kept.
 *
 * Note: TLS peer/host verification is disabled. With naming rule '1', URLs
 * that share a file name write to the same target file.
 *
 * @param array  $arrs   URLs to download; keys may be arbitrary.
 * @param string $dir    Target directory, including the trailing separator.
 * @param string $method HTTP method, upper-cased before use.
 * @param string $type   Naming rule: '1' = file name taken from the URL,
 *                       anything else = randomly generated name.
 * @return void
 */
function curl_downcss_multi($arrs=array(),$dir='',$method='GET',$type='1') {
    if (!is_dir($dir)) {
        mkdir($dir,0777,true);
    }
    $conn=array();
    $file_path=array();
    $fp=array();
    $mh=curl_multi_init();
    foreach ($arrs as $k=>$v) {
        // Use only the URL path so "?v=1" style suffixes never reach the file name.
        $path=parse_url($v,PHP_URL_PATH);
        if (!is_string($path) || $path==='') {
            $path=$v;
        }
        $ext=pathinfo($path,PATHINFO_EXTENSION);
        if ($ext==='') {
            $ext='css';
        }
        $name=$type=='1'
            ?pathinfo($path,PATHINFO_FILENAME)
            :sha1(md5(microtime(true).mt_rand(1,100000).mt_rand(1,100000)));
        $target=$dir.$name.'.'.$ext;
        $handle=fopen($target,'w');
        if ($handle===false) {
            continue;
        }
        $file_path[$k]=$target;
        $fp[$k]=$handle;
        $conn[$k]=curl_init();
        $arr=array(
            CURLOPT_URL => $v,
            CURLOPT_CUSTOMREQUEST => strtoupper($method),
            CURLOPT_HEADER => 0,
            CURLOPT_SSL_VERIFYPEER => 0,
            CURLOPT_SSL_VERIFYHOST => 0,
            CURLOPT_FOLLOWLOCATION => 1,
            CURLOPT_CONNECTTIMEOUT => 60,
            CURLOPT_TIMEOUT => 60,
            CURLOPT_FILE => $fp[$k],
        );
        curl_setopt_array($conn[$k],$arr);
        curl_multi_add_handle($mh,$conn[$k]);
    }

    // Plain local counter: a static one would keep growing across calls.
    $ok=0;
    $active=0;
    do {
        $status=curl_multi_exec($mh,$active);
        while ($done=curl_multi_info_read($mh)) {
            // Transfers finish in arbitrary order, so look up which URL this
            // handle belongs to instead of assuming a running index.
            $k=array_search($done['handle'],$conn,true);
            if ($k===false) {
                curl_multi_remove_handle($mh,$done['handle']);
                continue;
            }
            $info=curl_getinfo($done['handle']);
            curl_multi_remove_handle($mh,$done['handle']);
            curl_close($done['handle']);
            fclose($fp[$k]);

            // Measure the file only after it is closed, bypassing the stat cache.
            clearstatcache(true,$file_path[$k]);
            $size=is_file($file_path[$k])?filesize($file_path[$k]):-1;
            $complete=$done['result']===CURLE_OK
                && $info['http_code'] == '200'
                && $size == $info['size_download'];
            if ($complete) {
                ++$ok;
            } elseif (is_file($file_path[$k])) {
                unlink($file_path[$k]);
            }
            unset($conn[$k],$fp[$k]);
        }
        $running=$active > 0 && ($status===CURLM_OK || $status===CURLM_CALL_MULTI_PERFORM);
        if ($running && $status===CURLM_OK) {
            // Block until there is activity instead of spinning the CPU.
            if (curl_multi_select($mh,1.0)===-1) {
                usleep(1000);
            }
        }
    } while ($running);

    // Anything still registered here did not finish (multi-level error):
    // release it and drop the partial file.
    foreach ($conn as $k=>$handle) {
        curl_multi_remove_handle($mh,$handle);
        curl_close($handle);
        fclose($fp[$k]);
        if (is_file($file_path[$k])) {
            unlink($file_path[$k]);
        }
    }
    curl_multi_close($mh);
    echo 'ok: '.$ok;
}

推荐阅读