首页 > 解决方案 > 试图从 url 中抓取 web 数据使用框架。获取 MSHTML.HTMLDocument 对象中的数据。我想将对象另存为 .xls 在硬盘上

问题描述

我想从 URL 中抓取数据。我在excel中使用VBA。我能够在 MSHTML.HTMLDocument iframeDoc 对象中获取数据。现在我想将 iframeDoc 对象中的数据保存为硬盘上的 excel 文件。iframeDoc.Documentelent.innerHTML 之类的属性都不起作用。它会产生运行时错误。对象不支持该属性。所以我需要帮助将对象转换为字符串类型或任何其他方法来保存到硬盘。谢谢。

我尝试将 MSHTML.HTMLDocument DocumentElemnt 和 Body 的属性保存到字符串。那些给出运行时错误。

 Sub

    Dim ie As SHDocVw.InternetExplorer

    Dim doc As MSHTML.HTMLDocument

    Dim url As String

url = "http://www.1line.williams.com/Transco/info-postings/notices/critical-notices.html"
Set ie = New SHDocVw.InternetExplorer
ie.Visible = True
ie.navigate url

While ie.Busy Or ie.readyState <> READYSTATE_COMPLETE
    DoEvents
Wend

Set doc = ie.document

Dim iframeDoc As MSHTML.HTMLDocument
Set iframeDoc = doc.frames.Item("theiframe")

If iframeDoc Is Nothing Then
    MsgBox "IFrame with name 'iframename' was not found."
    ie.Quit
    Exit Sub
Else
'*** to save iframeDoc object as .xls file on hard disc ***

Dim strframe As String
   strframe = CStr(iframeDoc.DocumentElement.innerHTML) ' Run time error 
    'object does not support the property or method
   End If
  End Sub   

我希望 iframedoc.DocumentElement.innerHTML 被转换为字符串。但是它给出了运行时错误对象不支持该属性或方法。

标签: htmlexcelvbaweb-scraping

解决方案


您需要留出足够的时间让 javascript 在 iframe 中运行和填充表格。我为此使用了一个定时循环

Option Explicit
Public Sub GetInfo()
    Dim ie As InternetExplorer, ws As Worksheet
    Dim hTable As HTMLTable, t As Date
    Const MAX_WAIT_SEC As Long = 10              '<==Adjust wait time
    Set ws = ThisWorkbook.Worksheets("Sheet1")
    Set ie = New InternetExplorer

    With ie
        .Visible = True
        .Navigate2 "http://www.1line.williams.com/Transco/info-postings/notices/critical-notices.html"

        While .Busy Or .readyState < 4: DoEvents: Wend
        t = Timer
        Do
            On Error Resume Next
            With .document.getElementById("theiframe").contentDocument.getElementById("j_idt11")
                Set hTable = .getElementsByTagName("table")(1)
            End With
            On Error GoTo 0
            If Timer - t > MAX_WAIT_SEC Then Exit Do
        Loop While hTable Is Nothing
        If Not hTable Is Nothing Then
            WriteTable hTable, 1, ws
        End If
        .Quit
    End With
End Sub

Public Sub WriteTable(ByVal hTable As HTMLTable, Optional ByVal startRow As Long = 1, Optional ByVal ws As Worksheet)
    If ws Is Nothing Then Set ws = ActiveSheet

    Dim tRow As Object, td As Object, r As Long, c As Long
    r = startRow
    With ws
        Dim headers As Object, header As Object, columnCounter As Long
        Set headers = hTable.getElementsByTagName("th")
        For Each header In headers
            columnCounter = columnCounter + 1
            .Cells(r, columnCounter) = header.innerText
        Next header
        For Each tRow In hTable.getElementsByTagName("tr")
            r = r + 1: c = 1
            For Each td In tRow.getElementsByTagName("td")
                If td.classname = "ui-col-7" Then  'or you could use if c = 7
                    .Cells(r, c).Value = "http://www.1line.williams.com" & Replace$(Split(Split(td.outerhtml, "href=" & Chr$(34))(1), ">")(0), Chr$(34), vbNullString)
                Else
                    .Cells(r, c).Value = td.innerText
                End If
                c = c + 1
            Next td
        Next tRow
    End With
End Sub

推荐阅读