vba - 使用 VBA 代码如何从 word 文档中提取位于每个标题下的非 HTML 数据内容
问题描述
无论标题样式如何,如何提取与每个标题相关的文本和非文本数据内容(例如:表格、图片)?
使用下面的代码,我可以访问每个标题,发布我无法提取与该标题关联的内容:
Option Explicit
Sub Main()
Dim strFile As String
Dim oWord As Word.Application
Dim oWdoc As Word.Document
Dim oPar As Word.Paragraph
Dim rng As Word.Range
strFile = "C:\Users\SQVA\Desktop\My_Work\MyTest3.docx"
'Set oWord = CreateObject("Word.Application")
Set oWord = New Word.Application
Set oWdoc = oWord.Documents.Open(strFile)
Call Get_Heading_Name(oWord, oWdoc, strFile, rng)
Call Close_Word(oWord, oWdoc)
End Sub
Sub Get_Heading_Name(oWord As Word.Application, oWdoc As Word.Document, strFile As String, rng As Word.Range)
oWord.Visible = True
Dim astrHeadings As Variant
Dim strText As String
Dim intItem As Integer
Set rng = oWdoc.Content
astrHeadings = _
oWdoc.GetCrossReferenceItems(wdRefTypeHeading)
For intItem = LBound(astrHeadings) To UBound(astrHeadings)
strText = Trim$(astrHeadings(intItem))
'Debug.Print CStr(strText)
'Debug.Print astrHeadings(intItem).
Dim my_String As String
Dim intLevel
If CStr(strText) <> "" Then
my_String = Right(strText, Len(strText) - InStr(strText, " "))
intLevel = GetLevel(CStr(astrHeadings(intItem)))
' Call GetHeadingNextText(oWdoc, my_String)
' Debug.Print my_String
' Debug.Print intLevel
' rng.Style = "Heading " & intLevel
Dim sTextSearch() As String
Dim StrHdTxt1
Dim nStart As Long, nEnd As Long, n As Long, k As Long
Dim wdTable
Dim wdTbl As Word.Table, wdCell As Word.cell, wdCellRng As Word.Range
Dim wdIshp As Word.InlineShape, wdShp As Word.Shape, StrHdTxt As String
oWdoc.Range(0, 0).Select
With oWord.Selection.Find
.Style = oWdoc.Styles("Heading " & intLevel)
.Text = my_String
If .Execute Then
'Debug.Print "Found"
Call SelectHeadingandContent(oWdoc, oWord)
End If
End With
End If
Next intItem
End Sub
Sub Close_Word(oWord As Word.Application, oWdoc As Word.Document)
oWdoc.Close SaveChanges:=wdDoNotSaveChanges
oWord.Quit
Set oWdoc = Nothing
Set oWord = Nothing
End Sub
Private Function GetLevel(strItem As String) As Integer
' Return the heading level of a header from the
' array returned by Word.
' The number of leading spaces indicates the
' outline level (2 spaces per level: H1 has
' 0 spaces, H2 has 2 spaces, H3 has 4 spaces.
Dim strTemp As String
Dim strOriginal As String
Dim longDiff As Integer
' Get rid of all trailing spaces.
strOriginal = RTrim$(strItem)
' Trim leading spaces, and then compare with
' the original.
strTemp = LTrim$(strOriginal)
' Subtract to find the number of
' leading spaces in the original string.
longDiff = Len(strOriginal) - Len(strTemp)
GetLevel = (longDiff / 2) + 1
End Function
Sub SelectHeadingandContent(oWdoc As Word.Document, oWord As Word.Application)
Dim headStyle 'As Style
' Checks that you have selected a heading. If you have selected multiple paragraphs,checks only the first one. If you have selected a heading, makes sure the whole paragraph is selected and records the style. If not, exits the subroutine.
If oWdoc.Styles(oWord.Selection.Paragraphs(1).Style).ParagraphFormat.OutlineLevel < wdOutlineLevelBodyText Then
Set headStyle = oWord.Selection.Paragraphs(oWord.Selection.Paragraphs.Count).Style
oWord.Selection.Expand wdParagraph
Else: Exit Sub
End If
' Turns off screen updating so the the screen does not flicker.
Application.ScreenUpdating = False
' Loops through the paragraphs following your selection, and incorporates them into the selection as long as they have a higher outline level than the selected heading (which corresponds to a lower position in the document hierarchy). Exits the loop if there are no more paragraphs in the document.
Dim My_Text As String
My_Text = ""
Do While oWdoc.Styles(oWord.Selection.Paragraphs(oWord.Selection.Paragraphs.Count).Next.Style).ParagraphFormat.OutlineLevel > headStyle.ParagraphFormat.OutlineLevel
'Debug.Print oWord.Selection.Paragraphs(oWord.Selection.Paragraphs.Count).Range.Text
oWord.Selection.MoveEnd wdParagraph
' Debug.Print oWord.Selection.Paragraphs(oWord.Selection.Paragraphs.Count).Range.Text
My_Text = My_Text + vbCr + oWord.Selection.Paragraphs(oWord.Selection.Paragraphs.Count).Range.Text
If oWord.Selection.Paragraphs(oWord.Selection.Paragraphs.Count).Next Is Nothing Then Exit Do
Loop
Debug.Print My_Text
' Turns screen updating back on.
Application.ScreenUpdating = True
End Sub
解决方案
您可以使用以下代码循环遍历所有 Heading1 范围及其“非文本”对象,如您所称:
Sub Read_Heading_Contents()
Dim wdApp As New Word.Application, wdDoc As Word.Document, wdRng As Word.Range
Dim wdTbl As Word.Table, wdCell As Word.Cell, wdCellRng As Word.Range
Dim wdIshp As Word.InlineShape, wdShp As Word.Shape, StrHdTxt As String
Const strFile As String = "C:\Users\SQVA\Desktop\My_Work\MyTest3.docx"
With wdApp
.Visible = True
Set wdDoc = .Documents.Open(Filename:=strFile, ReadOnly:=True, AddToRecentFiles:=False, Visible:=False)
With wdDoc
With .Range
With .Find
.Style = wdStyleHeading1
.Text = ""
.Wrap = wdFindStop
.Execute
End With
If .Find.Found = False Then
MsgBox "No 'Heading 1' style found."
Else
Do While .Find.Found = True
StrHdTxt = .Duplicate.Text: MsgBox StrHdTxt
Set wdRng = .Duplicate.GoTo(What:=wdGoToBookmark, Name:="\HeadingLevel")
For Each wdTable In .Tables
With wdTbl
For Each wdCell In .Range.Cells
Set wdCellRng = wdCell.Range
wdCellRng.End = wdCellRng.End - 1
MsgBox wdCellRng.Text
Next
End With
Next
For Each wdIshp In wdRng.InlineShapes
With wdIshp
If Not .TextEffect Is Nothing Then
MsgBox .TextEffect.Text
End If
End With
Next
For Each wdShp In wdRng.ShapeRange
With wdShp
If Not .TextFrame Is Nothing Then
MsgBox .TextFrame.TextRange.Text
End If
End With
Next
.Collapse wdCollapseEnd
.Find.Execute
Loop
End If
End With
.Close SaveChanges:=wdDoNotSaveChanges
End With
.Quit
End With
Set wdRng = Nothing: Set wdDoc = Nothing: Set wdApp = Nothing
End Sub
上面的代码包括用于显示标题名称以及在标题范围的“非文本”内容中找到的任何内容的消息框。我会留给您将文本框输出转换为您想要的任何其他内容。当然,并非所有内联和浮动形状都有文本;循环也找到了那些,但我不知道你打算如何“阅读”那些。
推荐阅读
- single-page-application - SPA 产品购物车
- jenkins - 从 Jenkinsfile 中的 jenkins 作业配置中读取 SVN URL
- node.js - 猫鼬和本地化
- android - 更改片段时Android livedata失去活跃的观察者
- javascript - Mac OS X - NPM + Jasmine 未安装。每次终端关闭时都必须重新安装
- julia - Julia 点矢量化 - 我需要矢量化到调用链的深度吗?
- javascript - 使用 javascript 设置属性
- mongodb - 使用 MongoDB 的第一周保留
- python - try except 块中的 Python break 语句
- java - rmi 服务器监听 2 个不同的 ip