首页 > 解决方案 > 使用 VBA 代码如何从 word 文档中提取位于每个标题下的非 HTML 数据内容

问题描述

无论标题样式如何,如何提取与每个标题相关的文本和非文本数据内容(例如:表格、图片)?

使用下面的代码,我可以访问每个标题,发布我无法提取与该标题关联的内容:

Option Explicit
Sub Main()
        Dim strFile As String
        Dim oWord As Word.Application
        Dim oWdoc As Word.Document
        Dim oPar As Word.Paragraph
        Dim rng As Word.Range
        strFile = "C:\Users\SQVA\Desktop\My_Work\MyTest3.docx"
        'Set oWord = CreateObject("Word.Application")
        Set oWord = New Word.Application
        Set oWdoc = oWord.Documents.Open(strFile)
    Call Get_Heading_Name(oWord, oWdoc, strFile, rng)
    Call Close_Word(oWord, oWdoc)
End Sub  

   Sub Get_Heading_Name(oWord As Word.Application, oWdoc As Word.Document, strFile As String, rng As Word.Range)
    oWord.Visible = True
    Dim astrHeadings As Variant
    Dim strText As String
    Dim intItem As Integer
    Set rng = oWdoc.Content
    astrHeadings = _
    oWdoc.GetCrossReferenceItems(wdRefTypeHeading)
    For intItem = LBound(astrHeadings) To UBound(astrHeadings)
            strText = Trim$(astrHeadings(intItem))
            'Debug.Print CStr(strText)
            'Debug.Print astrHeadings(intItem).
            Dim my_String As String
            Dim intLevel
            If CStr(strText) <> "" Then
                my_String = Right(strText, Len(strText) - InStr(strText, " "))
                intLevel = GetLevel(CStr(astrHeadings(intItem)))
               ' Call GetHeadingNextText(oWdoc, my_String)
               ' Debug.Print my_String
               ' Debug.Print intLevel
               ' rng.Style = "Heading " & intLevel
                Dim sTextSearch() As String
                Dim StrHdTxt1
                Dim nStart As Long, nEnd As Long, n As Long, k As Long
                Dim wdTable
                Dim wdTbl As Word.Table, wdCell As Word.cell, wdCellRng As Word.Range
                Dim wdIshp As Word.InlineShape, wdShp As Word.Shape, StrHdTxt As String
                oWdoc.Range(0, 0).Select
                With oWord.Selection.Find
                    .Style = oWdoc.Styles("Heading " & intLevel)
                    .Text = my_String
                    If .Execute Then
                        'Debug.Print "Found"
                        Call SelectHeadingandContent(oWdoc, oWord)
                    End If
                End With
            End If
    Next intItem
End Sub  

Sub Close_Word(oWord As Word.Application, oWdoc As Word.Document)
        oWdoc.Close SaveChanges:=wdDoNotSaveChanges
        oWord.Quit
        Set oWdoc = Nothing
        Set oWord = Nothing
End Sub

Private Function GetLevel(strItem As String) As Integer
    ' Return the heading level of a header from the
    ' array returned by Word.

    ' The number of leading spaces indicates the
    ' outline level (2 spaces per level: H1 has
    ' 0 spaces, H2 has 2 spaces, H3 has 4 spaces.

    Dim strTemp As String
    Dim strOriginal As String
    Dim longDiff As Integer

    ' Get rid of all trailing spaces.
    strOriginal = RTrim$(strItem)

    ' Trim leading spaces, and then compare with
    ' the original.
    strTemp = LTrim$(strOriginal)

    ' Subtract to find the number of
    ' leading spaces in the original string.
    longDiff = Len(strOriginal) - Len(strTemp)
    GetLevel = (longDiff / 2) + 1
End Function

Sub SelectHeadingandContent(oWdoc As Word.Document, oWord As Word.Application)
Dim headStyle 'As Style
' Checks that you have selected a heading. If you have selected multiple paragraphs,checks only the first one. If you have selected a heading, makes sure the whole paragraph is selected and records the style. If not, exits the subroutine.
If oWdoc.Styles(oWord.Selection.Paragraphs(1).Style).ParagraphFormat.OutlineLevel < wdOutlineLevelBodyText Then
    Set headStyle = oWord.Selection.Paragraphs(oWord.Selection.Paragraphs.Count).Style
    oWord.Selection.Expand wdParagraph
Else: Exit Sub
End If

' Turns off screen updating so the the screen does not flicker.

Application.ScreenUpdating = False

' Loops through the paragraphs following your selection, and incorporates them into the selection as long as they have a higher outline level than the selected heading (which corresponds to a lower position in the document hierarchy). Exits the loop if there are no more paragraphs in the document.
Dim My_Text As String
My_Text = ""
Do While oWdoc.Styles(oWord.Selection.Paragraphs(oWord.Selection.Paragraphs.Count).Next.Style).ParagraphFormat.OutlineLevel > headStyle.ParagraphFormat.OutlineLevel
     'Debug.Print oWord.Selection.Paragraphs(oWord.Selection.Paragraphs.Count).Range.Text
     oWord.Selection.MoveEnd wdParagraph
     ' Debug.Print oWord.Selection.Paragraphs(oWord.Selection.Paragraphs.Count).Range.Text
     My_Text = My_Text + vbCr + oWord.Selection.Paragraphs(oWord.Selection.Paragraphs.Count).Range.Text
     If oWord.Selection.Paragraphs(oWord.Selection.Paragraphs.Count).Next Is Nothing Then Exit Do
Loop
Debug.Print My_Text
' Turns screen updating back on.

Application.ScreenUpdating = True
End Sub

标签: vbams-word

解决方案


您可以使用以下代码循环遍历所有 Heading1 范围及其“非文本”对象,如您所称:

Sub Read_Heading_Contents()
Dim wdApp As New Word.Application, wdDoc As Word.Document, wdRng As Word.Range
Dim wdTbl As Word.Table, wdCell As Word.Cell, wdCellRng As Word.Range
Dim wdIshp As Word.InlineShape, wdShp As Word.Shape, StrHdTxt As String
Const strFile As String = "C:\Users\SQVA\Desktop\My_Work\MyTest3.docx"
With wdApp
  .Visible = True
  Set wdDoc = .Documents.Open(Filename:=strFile, ReadOnly:=True, AddToRecentFiles:=False, Visible:=False)
  With wdDoc
    With .Range
      With .Find
        .Style = wdStyleHeading1
        .Text = ""
        .Wrap = wdFindStop
        .Execute
      End With
      If .Find.Found = False Then
        MsgBox "No 'Heading 1' style found."
      Else
        Do While .Find.Found = True
          StrHdTxt = .Duplicate.Text: MsgBox StrHdTxt
          Set wdRng = .Duplicate.GoTo(What:=wdGoToBookmark, Name:="\HeadingLevel")
          For Each wdTable In .Tables
            With wdTbl
              For Each wdCell In .Range.Cells
                Set wdCellRng = wdCell.Range
                wdCellRng.End = wdCellRng.End - 1
                MsgBox wdCellRng.Text
              Next
            End With
          Next
          For Each wdIshp In wdRng.InlineShapes
            With wdIshp
              If Not .TextEffect Is Nothing Then
                MsgBox .TextEffect.Text
              End If
            End With
          Next
          For Each wdShp In wdRng.ShapeRange
            With wdShp
              If Not .TextFrame Is Nothing Then
                MsgBox .TextFrame.TextRange.Text
              End If
            End With
          Next
          .Collapse wdCollapseEnd
          .Find.Execute
        Loop
      End If
    End With
    .Close SaveChanges:=wdDoNotSaveChanges
  End With
  .Quit
End With
Set wdRng = Nothing: Set wdDoc = Nothing: Set wdApp = Nothing
End Sub

上面的代码包括用于显示标题名称以及在标题范围的“非文本”内容中找到的任何内容的消息框。我会留给您将文本框输出转换为您想要的任何其他内容。当然,并非所有内联和浮动形状都有文本;循环也找到了那些,但我不知道你打算如何“阅读”那些。


推荐阅读