首页 > 解决方案 > 提取带有位置的文本数据

问题描述

我已经成功地从 PDF 中提取了带有位置数据的字符,但是当我尝试连同位置数据一起提取单词时,文本并不总是以完整单词的形式出现。例如,Saskatchewan 这样的单词可能会被拆分为 SA、SKA、TCH、EWAN 几个片段。有什么办法可以解决这个问题吗?我找到过类似的问题,但其答案针对的是旧版本的 iTextSharp。

  /// <summary>
  /// Text extraction strategy that, on top of the behaviour inherited from
  /// <see cref="LocationTextExtractionStrategy"/>, records every rendered text chunk
  /// and every individual glyph together with its position and font metrics.
  /// </summary>
  /// <remarks>
  /// <para>
  /// <see cref="CharacterResult"/> and <see cref="WordResult"/> are static: they are
  /// shared by all instances and are never cleared by this class. Callers that process
  /// more than one page or document must clear them between runs.
  /// </para>
  /// <para>
  /// An entry in <see cref="WordResult"/> is one RENDER_TEXT chunk that contains no
  /// space character. It is not necessarily a complete word: a chunk is whatever the
  /// PDF content stream rendered in a single text operation.
  /// </para>
  /// </remarks>
  public class TextLocationStrategy : LocationTextExtractionStrategy
  {

    /// <summary>One entry per rendered glyph that is not a space (shared by all instances).</summary>
    public static List<PdfText> CharacterResult = new List<PdfText>();

    /// <summary>One entry per rendered text chunk that contains no space (shared by all instances).</summary>
    public static List<PdfText> WordResult = new List<PdfText>();

    /// <summary>
    /// Decides whether a word boundary lies between two consecutive chunks.
    /// </summary>
    /// <param name="chunk">The chunk being appended.</param>
    /// <param name="previousChunk">The chunk that precedes it.</param>
    /// <returns>
    /// <c>true</c> when the gap between the chunks is larger than a quarter of the space
    /// width, or when the chunks overlap by more than a full space width.
    /// </returns>
    protected override bool IsChunkAtWordBoundary(TextChunk chunk, TextChunk previousChunk)
    {
      var location = chunk.GetLocation();

      // DistanceFromEndOf expects the other chunk's location, not the chunk itself.
      float dist = location.DistanceFromEndOf(previousChunk.GetLocation());
      float spaceWidth = location.GetCharSpaceWidth();

      return dist < -spaceWidth || dist > spaceWidth / 4.0f;
    }

    /// <summary>
    /// Handles a parser event. RENDER_TEXT events are forwarded to the base strategy and
    /// additionally recorded in <see cref="WordResult"/> and <see cref="CharacterResult"/>.
    /// Any exception is passed to <c>ErrorManager.ReportError</c> instead of propagating.
    /// </summary>
    /// <param name="data">Event payload; a <see cref="TextRenderInfo"/> for RENDER_TEXT events.</param>
    /// <param name="type">Kind of event being reported.</param>
    public override void EventOccurred(IEventData data, EventType type)
    {
      try
      {
        // Let the base strategy collect its own chunks; without this call the inherited
        // GetResultantText() stays empty and IsChunkAtWordBoundary is never consulted.
        base.EventOccurred(data, type);

        if (type != EventType.RENDER_TEXT)
        {
          return;
        }

        TextRenderInfo renderInfo = (TextRenderInfo)data;
        string curFont = renderInfo.GetFont().GetFontProgram().ToString();
        float curFontSize = renderInfo.GetFontSize();

        // The whole chunk is recorded as a "word" only when it contains no space.
        if (!renderInfo.GetText().Contains(' '))
        {
          WordResult.Add(CreatePdfText(renderInfo, curFont, curFontSize));
        }

        foreach (TextRenderInfo glyph in renderInfo.GetCharacterRenderInfos())
        {
          if (!glyph.GetText().Contains(' '))
          {
            CharacterResult.Add(CreatePdfText(glyph, curFont, curFontSize));
          }
        }
      }
      catch (Exception ex)
      {
        ErrorManager.ReportError(ex);
      }
    }

    /// <summary>Returns the shared list of recorded glyphs (the same instance as <see cref="CharacterResult"/>).</summary>
    public List<PdfText> GetCharacterData()
    {
      return CharacterResult;
    }

    /// <summary>Returns the shared list of recorded chunks (the same instance as <see cref="WordResult"/>).</summary>
    public List<PdfText> GetWordData()
    {
      return WordResult;
    }

    /// <summary>
    /// Builds a <see cref="PdfText"/> from a render info, spanning from the start of its
    /// baseline to the end of its ascent line.
    /// </summary>
    /// <param name="info">Chunk or single-glyph render info to describe.</param>
    /// <param name="fontName">Font description of the enclosing chunk.</param>
    /// <param name="fontSize">Font size of the enclosing chunk; stored rounded to an integer.</param>
    private static PdfText CreatePdfText(TextRenderInfo info, string fontName, float fontSize)
    {
      Vector start = info.GetBaseline().GetStartPoint();
      Vector end = info.GetAscentLine().GetEndPoint();

      float startX = start.Get(0);
      float startY = start.Get(1);
      float endX = end.Get(0);
      float endY = end.Get(1);

      return new PdfText
      {
        Text = info.GetText(),
        Rectangle = new Rectangle(startX, startY, endX - startX, endY - startY),
        FontFamily = fontName,
        FontSize = Convert.ToInt32(fontSize),
        SpaceWidth = info.GetSingleSpaceWidth(),
        HorizontalScaling = info.GetHorizontalScaling(),
        Leading = info.GetLeading(),
        CharacterSpacing = info.GetCharSpacing(),
        StartPoint = new Autodesk.AutoCAD.Geometry.Point3d(startX, startY, 0.0),
        EndPoint = new Autodesk.AutoCAD.Geometry.Point3d(endX, endY, 0.0)
      };
    }

  }


  /// <summary>
  /// Plain data holder describing one piece of extracted PDF text (a text chunk or a
  /// single glyph) together with its geometry and font metrics.
  /// </summary>
  public class PdfText
  {

    /// <summary>The extracted text.</summary>
    public string Text { get; set; }
    /// <summary>Rectangle associated with the text.</summary>
    public Rectangle Rectangle { get; set; }
    /// <summary>Font description string for the text.</summary>
    public string FontFamily { get; set; }
    /// <summary>Font size as an integer (any fractional part of the source size is not kept).</summary>
    public int FontSize { get; set; }
    /// <summary>Width of a single space for this text.</summary>
    public float SpaceWidth { get; set; }
    /// <summary>Character spacing in effect for this text.</summary>
    public float CharacterSpacing { get; set; }
    /// <summary>Leading in effect for this text.</summary>
    public float Leading { get; set; }
    /// <summary>Horizontal scaling in effect for this text.</summary>
    public float HorizontalScaling { get; set; }
    /// <summary>Start point of the text.</summary>
    public Point3d StartPoint { get; set; }
    /// <summary>End point of the text.</summary>
    public Point3d EndPoint { get; set; }


  }

标签: c# pdf itext pdf-generation extract

解决方案


推荐阅读