c# - 提取带有位置的文本数据
问题描述
我已经成功地从 pdf 中使用位置数据提取字符数据,但是当我尝试使用位置数据提取单词时,文本并不总是以单个单词的形式出现。像萨斯喀彻温省这样的东西可以拆分为 SA SKA TCH EWAN。有没有什么办法解决这一问题?我遇到了类似的问题,但答案是针对旧版本的 itextsharp。
public class TextLocationStrategy : LocationTextExtractionStrategy
{
public static List<PdfText> CharacterResult = new List<PdfText>();
public static List<PdfText> WordResult = new List<PdfText>();
protected override bool IsChunkAtWordBoundary(TextChunk chunk, TextChunk previousChunk)
{
float dist = chunk.GetLocation().DistanceFromEndOf(previousChunk);
if (dist < -chunk.GetLocation().GetCharSpaceWidth() || dist > chunk.GetLocation().GetCharSpaceWidth() / 4.0f)
{
return true;
}
return false;
}
public override void EventOccurred(IEventData data, EventType type)
{
try
{
if (!type.Equals(EventType.RENDER_TEXT))
{
return;
}
TextRenderInfo renderInfo = (TextRenderInfo)data;
string curFont = renderInfo.GetFont().GetFontProgram().ToString();
float curFontSize = renderInfo.GetFontSize();
IList<TextRenderInfo> text = renderInfo.GetCharacterRenderInfos();
string curText = renderInfo.GetText();
if (curText != " " && !curText.Contains(' '))
{
Vector wordStart = renderInfo.GetBaseline().GetStartPoint();
Vector wordEnd = renderInfo.GetAscentLine().GetEndPoint();
Rectangle wordRect = new Rectangle(wordStart.Get(0), wordStart.Get(1), wordEnd.Get(0) - wordStart.Get(0), wordEnd.Get(1) - wordStart.Get(1));
PdfText chunk = new PdfText
{
Text = curText,
Rectangle = wordRect,
FontFamily = curFont,
FontSize = Convert.ToInt32(curFontSize),
SpaceWidth = renderInfo.GetSingleSpaceWidth(),
HorizontalScaling = renderInfo.GetHorizontalScaling(),
Leading = renderInfo.GetLeading(),
CharacterSpacing = renderInfo.GetCharSpacing(),
StartPoint = new Autodesk.AutoCAD.Geometry.Point3d(wordStart.Get(0), wordStart.Get(1), 0.0),
EndPoint = new Autodesk.AutoCAD.Geometry.Point3d(wordEnd.Get(0), wordEnd.Get(1), 0.0)
};
WordResult.Add(chunk);
}
foreach (TextRenderInfo t in text)
{
string letter = t.GetText();
Vector letterStart = t.GetBaseline().GetStartPoint();
Vector letterEnd = t.GetAscentLine().GetEndPoint();
Rectangle letterRect = new Rectangle(letterStart.Get(0), letterStart.Get(1), letterEnd.Get(0) - letterStart.Get(0), letterEnd.Get(1) - letterStart.Get(1));
if (letter != " " && !letter.Contains(' '))
{
PdfText chunk = new PdfText
{
Text = letter,
Rectangle = letterRect,
FontFamily = curFont,
FontSize = Convert.ToInt32(curFontSize),
SpaceWidth = t.GetSingleSpaceWidth(),
HorizontalScaling = t.GetHorizontalScaling(),
Leading = t.GetLeading(),
CharacterSpacing = t.GetCharSpacing(),
StartPoint = new Autodesk.AutoCAD.Geometry.Point3d(letterStart.Get(0), letterStart.Get(1), 0.0),
EndPoint = new Autodesk.AutoCAD.Geometry.Point3d(letterEnd.Get(0), letterEnd.Get(1), 0.0)
};
CharacterResult.Add(chunk);
}
}
}
catch (Exception ex)
{
ErrorManager.ReportError(ex);
}
}
public List<PdfText> GetCharacterData()
{
List<PdfText> retVal = new List<PdfText>();
try
{
retVal = CharacterResult;
}
catch (Exception ex)
{
ErrorManager.ReportError(ex);
}
return retVal;
}
public List<PdfText> GetWordData()
{
List<PdfText> retVal = new List<PdfText>();
try
{
retVal = WordResult;
}
catch (Exception ex)
{
ErrorManager.ReportError(ex);
}
return retVal;
}
}
public class PdfText
{
public string Text { get; set; }
public Rectangle Rectangle { get; set; }
public string FontFamily { get; set; }
public int FontSize { get; set; }
public float SpaceWidth { get; set; }
public float CharacterSpacing { get; set; }
public float Leading { get; set; }
public float HorizontalScaling { get; set; }
public Point3d StartPoint { get; set; }
public Point3d EndPoint { get; set; }
}
解决方案
推荐阅读
- c++ - Visual Studio 2019 中的 Intellisense 延迟
- python - 如何将 URL 添加到 Telegram Bot 的 InlineKeyboardButton
- python - 计算每行df
- python - pyspark:如何使用过滤器功能将 rdd 与列表进行比较
- javascript - 从 API 计算 NodeJS 中的平均值
- c# - 将具有泛型类型的类存储在单个数组中
- java - 用java中另一个PDF的内容创建一个新的PDF
- iis - 用于 ARR 负载平衡器的 SSL 以管理 2 个网站
- javascript - Promise 和 RxJS 的区别
- linux-kernel - 在基于 linux 的 yocto 中使用 networkmanager 上网