首页 > 解决方案 > 无法使用 BeautifulSoup、Selenium 获取网页的所有 HTML 内容

问题描述

我正在尝试从此网页中提取所有港口的位置:https://directories.lloydslist.com/var/recordset/65237/pos/11

我从使用 BeautifulSoup 的基本代码开始:

    url ='https://directories.lloydslist.com/var/recordset/65237/pos/11'
    result = requests.get(url)
    c = result.content
    soup = BeautifulSoup(c)
    print(soup)

输出:

<!DOCTYPE html>
<!-- CMS Strata (c)2003-2020 AMA DataSet Limited (www.ama.uk.com) [CONTENT MANAGEMENT SYSTEM 0.9.40-alpha-lloyds-cms11-lloyds_list_2002] start:0.150--><html lang="en" xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><script type="text/javascript">(window.NREUM||(NREUM={})).loader_config={licenseKey:"154740f801",applicationID:"14826849"};window.NREUM||(NREUM={}),__nr_require=function(n,e,t){function r(t){if(!e[t]){var i=e[t]={exports:{}};n[t][0].call(i.exports,function(e){var i=n[t][1][e];return r(i||e)},i,i.exports)}return e[t].exports}if("function"==typeof __nr_require)return __nr_require;for(var i=0;i<t.length;i++)r(t[i]);return r}({1:[function(n,e,t){function r(){}function i(n,e,t){return function(){return o(n,[u.now()].concat(f(arguments)),e?null:this,t),e?void 0:this}}var o=n("handle"),a=n(4),f=n(5),c=n("ee").get("tracer"),u=n("loader"),s=NREUM;"undefined"==typeof window.newrelic&&(newrelic=s);var p=["setPageViewName","setCustomAttribute","setErrorHandler","finished","addToTrace","inlineHit","addRelease"],d="api-",l=d+"ixn-";a(p,function(n,e){s[e]=i(d+e,!0,"api")}),s.addPageAction=i(d+"addPageAction",!0),s.setCurrentRouteName=i(d+"routeName",!0),e.exports=newrelic,s.interaction=function(){return(new r).get()};var m=r.prototype={createTracer:function(n,e){var t={},r=this,i="function"==typeof e;return o(l+"tracer",[u.now(),n,t],r),function(){if(c.emit((i?"":"no-")+"fn-start",[u.now(),r,i],t),i)try{return e.apply(this,arguments)}catch(n){throw c.emit("fn-err",[arguments,this,n],t),n}finally{c.emit("fn-end",[u.now()],t)}}}};a("actionText,setName,setAttribute,save,ignore,onEnd,getContext,end,get".split(","),function(n,e){m[e]=i(l+e)}),newrelic.noticeError=function(n,e){"string"==typeof n&&(n=new Error(n)),o("err",[n,u.now(),!1,e])}},{}],2:[function(n,e,t){function r(n,e){var t=n.getEntries();t.forEach(function(n){"first-paint"===n.name?a("timing",["fp",Math.floor(n.startTime)]):"first-contentful-paint"===n.name&&a("timing",["fcp",Math.floor(n.startTime)])})}function i(n){if(n instanceof c&&!s){var 
e,t=Math.round(n.timeStamp);e=t>1e12?Date.now()-t:f.now()-t,s=!0,a("timing",["fi",t,{type:n.type,fid:e}])}}if(!("init"in NREUM&&"page_view_timing"in NREUM.init&&"enabled"in NREUM.init.page_view_timing&&NREUM.init.page_view_timing.enabled===!1)){var o,a=n("handle"),f=n("loader"),c=NREUM.o.EV;if("PerformanceObserver"in window&&"function"==typeof window.PerformanceObserver){o=new PerformanceObserver(r);try{o.observe({entryTypes:["paint"]})}catch(u){}}if("addEventListener"in document){var s=!1,p=["click","keydown","mousedown","pointerdown","touchstart"];p.forEach(function(n){document.addEventListener(n,i,!1)})}}},{}],3:[function(n,e,t){function r(n,e){if(!i)return!1;if(n!==i)return!1;if(!e)return!0;if(!o)return!1;for(var t=o.split("."),r=e.split("."),a=0;a<r.length;a++)if(r[a]!==t[a])return!1;return!0}var i=null,o=null,a=/Version\/(\S+)\s+Safari/;if(navigator.userAgent){var f=navigator.userAgent,c=f.match(a);c&&f.indexOf("Chrome")===-1&&f.indexOf("Chromium")===-1&&(i="Safari",o=c[1])}e.exports={agent:i,version:o,match:r}},{}],4:[function(n,e,t){function r(n,e){var t=[],r="",o=0;for(r in n)i.call(n,r)&&(t[o]=e(r,n[r]),o+=1);return t}var i=Object.prototype.hasOwnProperty;e.exports=r},{}],5:[function(n,e,t){function r(n,e,t){e||(e=0),"undefined"==typeof t&&(t=n?n.length:0);for(var r=-1,i=t-e||0,o=Array(i<0?0:i);++r<i;)o[r]=n[e+r];return o}e.exports=r},{}],6:[function(n,e,t){e.exports={exists:"undefined"!=typeof window.performance&&window.performance.timing&&"undefined"!=typeof window.performance.timing.navigationStart}},{}],ee:[function(n,e,t){function r(){}function i(n){function e(n){return n&&n instanceof r?n:n?c(n,f,o):o()}function t(t,r,i,o){if(!d.aborted||o){n&&n(t,r,i);for(var a=e(i),f=v(t),c=f.length,u=0;u<c;u++)f[u].apply(a,r);var p=s[y[t]];return p&&p.push([b,t,r,a]),a}}function l(n,e){h[n]=v(n).concat(e)}function m(n,e){var t=h[n];if(t)for(var r=0;r<t.length;r++)t[r]===e&&t.splice(r,1)}function v(n){return h[n]||[]}function g(n){return p[n]=p[n]||i(t)}function 
w(n,e){u(n,function(n,t){e=e||"feature",y[t]=e,e in s||(s[e]=[])})}var h={},y={},b={on:l,addEventListener:l,removeEventListener:m,emit:t,get:g,listeners:v,context:e,buffer:w,abort:a,aborted:!1};return b}function o(){return new r}function a(){(s.api||s.feature)&&(d.aborted=!0,s=d.backlog={})}var f="nr@context",c=n("gos"),u=n(4),s={},p={},d=e.exports=i();d.backlog=s},{}],gos:[function(n,e,t){function r(n,e,t){if(i.call(n,e))return n[e];var r=t();if(Object.defineProperty&&Object.keys)try{return Object.defineProperty(n,e,{value:r,writable:!0,enumerable:!1}),r}catch(o){}return n[e]=r,r}var i=Object.prototype.hasOwnProperty;e.exports=r},{}],handle:[function(n,e,t){function r(n,e,t,r){i.buffer([n],r),i.emit(n,e,t)}var i=n("ee").get("handle");e.exports=r,r.ee=i},{}],id:[function(n,e,t){function r(n){var e=typeof n;return!n||"object"!==e&&"function"!==e?-1:n===window?0:a(n,o,function(){return i++})}var i=1,o="nr@id",a=n("gos");e.exports=r},{}],loader:[function(n,e,t){function r(){if(!x++){var n=E.info=NREUM.info,e=l.getElementsByTagName("script")[0];if(setTimeout(s.abort,3e4),!(n&&n.licenseKey&&n.applicationID&&e))return s.abort();u(y,function(e,t){n[e]||(n[e]=t)}),c("mark",["onload",a()+E.offset],null,"api");var t=l.createElement("script");t.src="https://"+n.agent,e.parentNode.insertBefore(t,e)}}function i(){"complete"===l.readyState&&o()}function o(){c("mark",["domContent",a()+E.offset],null,"api")}function a(){return O.exists&&performance.now?Math.round(performance.now()):(f=Math.max((new Date).getTime(),f))-E.offset}var f=(new Date).getTime(),c=n("handle"),u=n(4),s=n("ee"),p=n(3),d=window,l=d.document,m="addEventListener",v="attachEvent",g=d.XMLHttpRequest,w=g&&g.prototype;NREUM.o={ST:setTimeout,SI:d.setImmediate,CT:clearTimeout,XHR:g,REQ:d.Request,EV:d.Event,PR:d.Promise,MO:d.MutationObserver};var 
h=""+location,y={beacon:"bam.nr-data.net",errorBeacon:"bam.nr-data.net",agent:"js-agent.newrelic.com/nr-1158.min.js"},b=g&&w&&w[m]&&!/CriOS/.test(navigator.userAgent),E=e.exports={offset:f,now:a,origin:h,features:{},xhrWrappable:b,userAgent:p};n(1),n(2),l[m]?(l[m]("DOMContentLoaded",o,!1),d[m]("load",r,!1)):(l[v]("onreadystatechange",i),d[v]("onload",r)),c("mark",["firstbyte",f],null,"api");var x=0,O=n(6)},{}],"wrap-function":[function(n,e,t){function r(n){return!(n&&n instanceof Function&&n.apply&&!n[a])}var i=n("ee"),o=n(5),a="nr@original",f=Object.prototype.hasOwnProperty,c=!1;e.exports=function(n,e){function t(n,e,t,i){function nrWrapper(){var r,a,f,c;try{a=this,r=o(arguments),f="function"==typeof t?t(r,a):t||{}}catch(u){d([u,"",[r,a,i],f])}s(e+"start",[r,a,i],f);try{return c=n.apply(a,r)}catch(p){throw s(e+"err",[r,a,p],f),p}finally{s(e+"end",[r,a,c],f)}}return r(n)?n:(e||(e=""),nrWrapper[a]=n,p(n,nrWrapper),nrWrapper)}function u(n,e,i,o){i||(i="");var a,f,c,u="-"===i.charAt(0);for(c=0;c<e.length;c++)f=e[c],a=n[f],r(a)||(n[f]=t(a,u?f+i:i,o,f))}function s(t,r,i){if(!c||e){var o=c;c=!0;try{n.emit(t,r,i,e)}catch(a){d([a,t,r,i])}c=o}}function p(n,e){if(Object.defineProperty&&Object.keys)try{var t=Object.keys(n);return t.forEach(function(t){Object.defineProperty(e,t,{get:function(){return n[t]},set:function(e){return n[t]=e,e}})}),e}catch(r){d([r])}for(var i in n)f.call(n,i)&&(e[i]=n[i]);return e}function d(e){try{n.emit("internal-error",e)}catch(t){}}return n||(n=i),t.inPlace=u,t.flag=a,t}},{}]},{},["loader"]);</script>
<meta content="blendTrans(Duration=0.1)" http-equiv="Page-Enter"/>
<meta content="blendTrans(Duration=0.1)" http-equiv="Page-Exit"/>
<meta content="support@ama.uk.com" http-equiv="Reply-to"/>
<meta content="CMS Strata - AMA DataSet Limited" name="generator"/>
<meta content="" name="description"/>
<meta content="" name="keywords"/>
<meta content="index, follow" name="robots"/>
<meta content="http://directories.lloydslist.com/var/recordset/65237/pos/11" property="og:url"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Lloyd's List - Directories</title>
<link href="/ll/images/favicon.ico" rel="icon" type="image/vnd.microsoft.icon"/>
<link href="/ll/images/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<link href="/ll/images/favicon.ico" rel="apple-touch-icon"/>
<link href="/cms/css/bootstrap/3.3.5/bootstrap.min.css?1468311567" media="all" rel="stylesheet" type="text/css"/>
<link href="/cms/css/ama.strata.css?1488964337" media="all" rel="stylesheet" type="text/css"/>
<link href="/css/head_footer.css?1495717019" media="all" rel="stylesheet" type="text/css"/>
<link href="/css/fonts.css?1492593457" media="all" rel="stylesheet" type="text/css"/>
<link href="/dist/index-generated.css?1562066943" media="all" rel="stylesheet" type="text/css"/>
<link href="/css/directories.css?1562066740" media="all" rel="stylesheet" type="text/css"/>
<link href="/css/font-awesome/font-awesome.css?1421945712" media="all" rel="stylesheet" type="text/css"/>
<link href="/css/site.css?1495446562" media="all" rel="stylesheet" type="text/css"/>
<link href="/css/survey_responsive.css?1495718076" media="all" rel="stylesheet" type="text/css"/>
<script src="/cms/includes/js/jquery/jquery-1.9.1.js?1470319420" type="text/javascript"></script>
<script src="/cms/includes/js/bootstrap/bootstrap.min.js?1396702205" type="text/javascript"></script>
<script src="/js/jquery/jquery-migrate-1.2.1.min.js?1490174098" type="text/javascript"></script>
<script src="/js/ajax/jqueryui/1.9.1/googleapis.jquery-ui.min.js?1489506075" type="text/javascript"></script>
<script src="/scripts/jquery.ui.map.full.min.js?1489420109" type="text/javascript"></script>
<script src="https://maps.googleapis.com/maps/api/js?key=AIzaSyCMhyPGe9zmfvZQz876tu40kbbzEqBebJk&amp;sensor=false?1490802358" type="text/javascript"></script>
<script src="/cms/includes/js/jquery.watermarkinput.js?1485525553" type="text/javascript"></script>
<script src="/cms/includes/js/jquery.autocomplete.js?1490112323" type="text/javascript"></script>
<script src="/cms/includes/js/ama.ajax.js?1468311510" type="text/javascript"></script>
<script src="/js/compare-ports.js?1491468859" type="text/javascript"></script>
<script src="/js/expand-section.js?1490182977" type="text/javascript"></script>
</head>
<body id="ama-page-">
<div class="ama-level-0" id="ama-layout-">
</div>
<!-- foot body -->
<!-- foot js -->
<script src="/dist/js/index.js?1497352833" type="text/javascript"></script>
<link href="/css/lloydslist.css?1562065235" media="all" rel="stylesheet" type="text/css"/>
<link href="/css/print.css?1492684720" media="print" rel="stylesheet" type="text/css"/>
<!-- BODY clode --> <script type="text/javascript">window.NREUM||(NREUM={});NREUM.info={"beacon":"bam.nr-data.net","licenseKey":"154740f801","applicationID":"14826849,14828730,26854960","transactionName":"NldbbUJZV0tXAEFRXQ8dbEtZF1BWUgZNFloVX1U=","queueTime":0,"applicationTime":153,"atts":"GhBYGwpDREU=","errorBeacon":"bam.nr-data.net","agent":""}</script></body>
</html>

输出不完整。它缺少整个 HTML 正文(body)的内容,并且由于某种原因,输出中 body 的 id 和 div 的 class 属性值也与实际页面不一致:

</head>
    <body id="ama-page-">
    <div class="ama-level-0" id="ama-layout-">
    </div>
    <!-- foot body -->
    <!-- foot js -->

将此与浏览器 (Chrome) 上的“查看页面源”选项进行比较:

     </head>
    <body id='ama-page-21397'>
                    <div id='ama-layout-21687' class='xsite_container ama-level-0'>
                        <div id='ama-layout-21337' class='lloydslist-header ama-level-1'>
                                <div id='ama-section-14735' class='ama-section'>
                                    <div id='ama-section-body-14735' class='sectionbody'>
<div id='ama-field-64687' class='ama-field-html-php'>
<script async src='https://securepubads.g.doubleclick.net/tag/js/gpt.js'></script>
<script>

所以你可以看到 body id 和 div 类略有不同:例如

<body id="ama-page-"> 

对比

<body id='ama-page-21397'>

所以这可能是问题所在,但我是网络抓取的新手,我正在努力寻找解决方案。

这是我想要获取的完整“查看网页源代码”内容:view-source:https://directories.lloydslist.com/var/recordset/65237/pos/11

非常感谢任何帮助!

注意:我也尝试过使用 Selenium,但仍然没有成功:

url = "https://directories.lloydslist.com/var/recordset/65237/pos/11"
opts = Options()
opts.add_argument("user-agent=whatever you want")
driver = webdriver.Chrome(chrome_options=opts)
driver.get(url)
time.sleep(3)
print(driver.page_source)

标签: javascript html selenium web-scraping beautifulsoup

解决方案


推荐阅读