html - 以毫秒为单位用 url 抓取网页 Unix
问题描述
我浏览的网络有几个页面。我想单击这些元素或使用 url,从而能够复制数据。初始 URL 以 = 1 & playerType = ALL & ts = 1558502019375 结尾,在我的代码中,有一个循环应该逐页获取数据,但我无法完成。
Sub UPDATE_DATA_MLB()
Application.ScreenUpdating = False
'dimension (set aside memory for) our variables
Dim objIE As InternetExplorer
Dim ele As Object
Dim y As Integer
Dim EstaPagina As Byte
Dim EstaURL As String
'Página inicial
EstaPagina = 1
'we will output data to excel, starting on row 1
y = 1
EstaURL = "http://mlb.mlb.com/stats/sortable.jsp#elem=%5Bobject+Object%5D&tab_level=child&click_text=Sortable+Player+hitting&game_type='R'&season=2018&season_type=ANY&league_code='MLB'§ionType=sp&statType=hitting&page=1&playerType=ALL&ts=1558502019375" '&ts=1526432697176"
'start a new browser instance
Set objIE = New InternetExplorer
'make browser visible
objIE.Visible = False
Do Until EstaPagina = 255
'navigate to page with needed data
objIE.navigate EstaURL & EstaPagina
'wait for page to load
Do While objIE.Busy = True Or objIE.readyState <> 4: DoEvents: Loop
'If UCase(Left(EstaURL, 211) & "1") = UCase(Left(objIE.LocationURL, (211 + Len(EstaPagina)))) And y > 1 Then Exit Do
'look at all the 'tr' elements in the 'table' with id 'myTable',
'and evaluate each, one at a time, using 'ele' variable
For Each ele In objIE.document.getElementById("datagrid").getElementsByTagName("tr")
'show the text content of 'tr' element being looked at
'Debug.Print ele.textContent
'each 'tr' (table row) element contains 4 children ('td') elements
'put text of 1st 'td' in col A
Sheets("Sheet1").Range("A" & y).Value = ele.Children(0).textContent
'put text of 2nd 'td' in col B
Sheets("Sheet1").Range("B" & y).Value = ele.Children(1).textContent
'put text of 3rd 'td' in col C
Sheets("Sheet1").Range("C" & y).Value = ele.Children(2).textContent
'put text of 4th 'td' in col D
Sheets("Sheet1").Range("D" & y).Value = ele.Children(5).textContent
'put text of 4th 'td' in col f
Sheets("Sheet1").Range("E" & y).Value = ele.Children(22).textContent
'increment row counter by 1
y = y + 1
Next
EstaPagina = EstaPagina + 1
Loop
lobjIE.Quit
Set objIE = Nothing
Set ele = Nothing
Range("A1").Select
Range(Selection, Selection.End(xlToRight)).Select
Range(Selection, Selection.End(xlDown)).Select
Selection.RemoveDuplicates Columns:=Array(1, 2, 3, 4, 5), _
Header:=xlNo
Application.ScreenUpdating = True
MsgBox "Volcado terminado", vbInformation
Range("A1").Select
'save the Excel workbook
ActiveWorkbook.Save
End Sub ```
解决方案
XMLHTTP:
该页面执行 ajax 调用以获取它用于更新每个页面的内容的 json。它使用查询字符串参数来执行此操作,其中之一是每页的记录数(默认为 50)。您可以通过开发工具在浏览器网络选项卡中查看此活动F12
您可以避免使用浏览器并向同一端点发出 xmlhttp 请求,更改查询字符串参数以获取所有结果。看来,在我的测试中,您不能一次请求所有结果。相反,我发出 1000 个结果的请求,然后检查是否有任何其他结果要获取,并以 1000 个为一组发出进一步的请求以获取它们。
我正在使用jsonconverter.bas来解析 json。我从 json 中提取信息并加载到一个数组中results
,这样我就可以一次将所有结果写到工作表中——这样可以更有效地减少工作表的 i/o。
将上面链接中的代码复制到名为 jsonConverter 的模块后,您需要转到“VBE > 工具 > 参考 > 添加对 Microsoft Scripting Runtime 的引用
去做:
- 为失败的请求添加错误处理
VBA:
Option Explicit
Public Sub GetResults()
'VBE > Tools > References > Microsoft Scripting Runtime
Dim ws As Worksheet, results(), i As Long, totalResults As Long
Dim headers(), columnCount As Long, pageNumber As Long
Dim numberOfPages As Long, resultsPerPage As Long, json As Object
resultsPerPage = 1000
pageNumber = 1
Set ws = ThisWorkbook.Worksheets("Sheet1")
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "http://mlb.mlb.com/pubajax/wf/flow/stats.splayer?season=2018&sort_order=%27desc%27&sort_column=%27avg%27&stat_type=hitting&page_type=SortablePlayer&game_type=%27R%27&player_pool=ALL&season_type=ANY&sport_code=%27mlb%27&results=" & resultsPerPage & "&recSP=" & pageNumber & "&recPP=" & resultsPerPage, False
.send
Set json = JsonConverter.ParseJson(.responseText)
totalResults = json("stats_sortable_player")("queryResults")("totalSize")
headers = json("stats_sortable_player")("queryResults")("row").item(1).keys
numberOfPages = json("stats_sortable_player")("queryResults")("totalP")
columnCount = UBound(headers) + 1
ReDim results(1 To totalResults, 1 To columnCount)
Dim r As Long, c As Long, dict As Object, key As Variant
For pageNumber = 1 To numberOfPages
If pageNumber > 1 Then
.Open "GET", "http://mlb.mlb.com/pubajax/wf/flow/stats.splayer?season=2018&sort_order=%27desc%27&sort_column=%27avg%27&stat_type=hitting&page_type=SortablePlayer&game_type=%27R%27&player_pool=ALL&season_type=ANY&sport_code=%27mlb%27&results=" & resultsPerPage & "&recSP=" & pageNumber & "&recPP=" & resultsPerPage, False
.send
Set json = JsonConverter.ParseJson(.responseText)
End If
For Each dict In json("stats_sortable_player")("queryResults")("row")
r = r + 1: c = 1
For Each key In dict.keys
results(r, c) = dict(key)
c = c + 1
Next
Next
Next
End With
With ws
.Cells(1, 1).Resize(1, columnCount) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
输出示例(欣赏不能真正按原样阅读 - 但布局的想法):
图片链接:https ://i.stack.imgur.com/jiDTP.png
IE浏览器:
如果您想使用较慢的浏览器解决方案,您可以将页码连接到 url 并循环以覆盖所有页面。可以从第 1 页的分页中提取页数。
通过查看此答案,您可以了解如何将表格写在彼此下方。GetLastRow(ws, 1) + 2
用to改变行GetLastRow(ws, 1) + 1
Option Explicit
'VBE > Tools > References: Microsoft Internet Controls
Public Sub GetData()
Dim ie As New InternetExplorer, numberOfPages As Long
Dim url As String, i As Long
Const PLAYERS_PER_PAGE = 50
url = "http://mlb.mlb.com/stats/sortable.jsp#elem=%5Bobject+Object%5D&tab_level=child&click_text=Sortable+Player+hitting&game_type='R'&season=2018&season_type=ANY&league_code='MLB'§ionType=sp&statType=hitting&page=1&playerType=ALL&ts="
With ie
.Visible = True
.Navigate2 url
While .Busy Or .readyState < 4: DoEvents: Wend
With .document
numberOfPages = CLng(.querySelector(".paginationWidget-last").innerText)
'do something with page 1
If numberOfPages > 1 Then
For i = 2 To numberOfPages
ie.Navigate2 Replace$(url, "page=1", "page=" & CStr(i))
While ie.Busy Or ie.readyState < 4: DoEvents: Wend
'do something with other pages
Next
Stop 'delete me later
End If
End With
.Quit
End With
End Sub
推荐阅读
- django - Django REST 框架:通过可浏览 API 中的 UpdateAPIView 与 RetrieveUpdateDestroyAPIView / 预填充表单进行更新
- javascript - 带有打字稿的样式化组件:没有提供道具的错误消息 - 没有重载匹配此调用
- ruby-on-rails - 错误验证时Rails渲染嵌套属性不显示字段
- javascript - 在聚合物js中的javascript文件中格式化html代码
- javascript - 刷新浏览器时如何在 Ionic 3 中重新加载当前页面?
- azure - Azure 数据工厂管道 - 如果内部活动失败
- python - Selenium WhatsApp 异常“元素不可交互”
- python - subprocess.Popen:无法在 Windows 上刷新标准输入进程
- oracle - oracle regexp_like & regexp_count 与变量/表属性
- salesforce - 仪表板中的日期格式