首页 > 解决方案 > SAS Proc Groovy - Apache PDFBox 库无法从 pdfbox.jar 文件中读取资源

问题描述

目标:读取 PDF 文件并将其转换为文本格式。

我为此使用了 SAS 的“Proc Groovy”和 Java 的“Apache PDFBox”库。但是自从 Apache PDFBox 的版本从 2.0.21 变为 2.0.22 之后,这段代码就开始报错。请问需要对这段代码做哪些修改,才能让它重新正常运行?

SAS Proc Groovy 可以运行 Java 代码。因此,我们在 SAS Proc Groovy 中使用 Java PDF 库 (Apache PDFBox) 将 PDF 转换为文本格式。

代码:

/* Filerefs for the downloaded PDF (input) and the extracted text (output). */
/* The macro variable temp must already hold a writable directory path.     */
filename overview "&temp/overview.pdf";
filename ov_text  "&temp/overview.txt";

* download a pdf document;
proc http
  url="https://cdn.nar.realtor/sites/default/files/documents/ehs-11-2020-overview-2020-12-22.pdf"
  method="get"
  proxyhost="&proxy_host."
  proxyport=&port
  out=overview;
run;

* download the Apache PDFBox library (a .jar file);
/* Double quotes are required here: inside single quotes the macro variable */
/* is not resolved and the jar would be written to a literal path.          */
filename jar "&temp/pdfbox.jar";

/* Only download when the jar is not already present.                       */
/* NOTE(review): a truncated or non-jar download is never replaced by this  */
/* guard; a ZipException at run time suggests deleting the file and         */
/* downloading it again - confirm the file is a valid jar.                  */
%if %sysfunc(FEXIST(jar)) ne 1 %then %do;
proc http
  /* single quotes are intentional: they keep the ampersand in the URL literal */
  url='https://www.apache.org/dyn/closer.lua?filename=pdfbox/2.0.22/pdfbox-app-2.0.22.jar&action=download'
  proxyhost="&proxy_host."
  proxyport=&port
  out=jar;
run;
%end;

* Use GROOVY to read the PDF, strip out the text and position, and write that
* parse to a text file which SAS can read;

/* The classpath is taken from the jar fileref so that it always points at  */
/* the exact file downloaded above.                                         */
proc groovy classpath="%sysfunc(pathname(jar))";
  submit
    "%sysfunc(pathname(overview))"  /* the input, a pdf file */
    "%sysfunc(pathname(ov_text))"   /* the output, a text file */
  ;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.io.FileWriter;
import java.io.PrintWriter;

/**
 * Extracts text from a PDF together with the position of every character.
 *
 * <p>The class hooks into {@link PDFTextStripper} by overriding
 * {@code writeString}: instead of emitting text to the writer handed to
 * {@code writeText}, each string the stripper reports is recorded in
 * {@link #lines} along with the (x, y) coordinates of its characters.
 * {@link #main(String[])} then writes those records to a text file.
 *
 * <p>The source deliberately avoids try-with-resources and lambdas so that it
 * can also be compiled by older Groovy versions.
 */
public class GetLinesFromPDF extends PDFTextStripper {

    /** One entry per {@code writeString} callback, in the order received. */
    static List<String> lines = new ArrayList<String>();

    /**
     * Creates the stripper.
     *
     * @throws IOException if the superclass constructor fails.
     */
    public GetLinesFromPDF() throws IOException {
    }

    /**
     * Reads the PDF named by {@code args[0]} and writes one line per extracted
     * string, with its character positions, to the file named by {@code args[1]}.
     *
     * @param args {@code args[0]} is the input PDF path, {@code args[1]} the output text path.
     * @throws IllegalArgumentException if fewer than two arguments are supplied.
     * @throws IOException If there is an error parsing the document or writing the output.
     */
    public static void main( String[] args ) throws IOException {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException(
                "Expected two arguments: <input pdf path> <output text path>");
        }

        String inPdf = args[0];
        String outTxt = args[1];
        PDDocument document = null;
        PrintWriter out = null;

        try {
            document = PDDocument.load( new File(inPdf) );

            PDFTextStripper stripper = new GetLinesFromPDF();

            stripper.setSortByPosition( true );
            // Page numbers are 1-based in PDFTextStripper.
            stripper.setStartPage( 1 );
            stripper.setEndPage( document.getNumberOfPages() );

            // The stripper insists on a Writer; the real output is collected
            // by writeString() below, so whatever lands here is discarded.
            Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
            stripper.writeText(document, dummy);

            out = new PrintWriter(new FileWriter(outTxt));

            // print lines to text file
            for (String line : lines) {
                out.println(line);
            }

            // PrintWriter never throws on write failure; surface it explicitly.
            if (out.checkError()) {
                throw new IOException("Error while writing to " + outTxt);
            }
        }
        finally {
            // Nested so that the output file is closed even when closing the
            // document throws.
            try {
                if (document != null) {
                    document.close();
                }
            }
            finally {
                if (out != null) {
                    out.close();
                }
            }
        }
    }

    /**
     * Override the default functionality of PDFTextStripper.writeString():
     * record the string and the coordinates of each of its characters instead
     * of writing the text to the output writer.
     *
     * @param str the text reported by the stripper.
     * @param textPositions the position of each character in {@code str}.
     * @throws IOException declared by the overridden method; not thrown here.
     */
    @Override
    protected void writeString(String str, List<TextPosition> textPositions) throws IOException {
        // StringBuilder avoids re-copying the growing string for every character.
        StringBuilder places = new StringBuilder();

        for (TextPosition tp : textPositions) {
            places.append("(").append(tp.getX()).append(",").append(tp.getY()).append(") ");
        }

        lines.add(str + " found @ " + places.toString());
    }
}

  endsubmit;
quit;

* preview the stripped text that was saved;

/* Echo the extracted text file to the SAS log, one record per line. */
/* data _null_ runs the step without creating an output data set.    */
data _null_;
  infile ov_text;
  input;            /* read the next record into the _infile_ buffer; no variables are assigned */
  putlog _infile_;  /* write the raw record to the log */
run;

运行上述代码后的部分错误消息:

    ERROR: The SUBMIT command failed.
    groovy.lang.GroovyRuntimeException: Failed to create Script instance for class: class GetLinesFromPDF. Reason: 
    java.util.zip.ZipException: error reading zip file
         at org.codehaus.groovy.runtime.InvokerHelper.createScript(InvokerHelper.java:475)
         at groovy.lang.GroovyShell.parse(GroovyShell.java:689)
         at groovy.lang.GroovyShell.parse(GroovyShell.java:725)
         at groovy.lang.GroovyShell.parse(GroovyShell.java:716)
    Caused by: java.util.zip.ZipException: error reading zip file
         at java.util.zip.ZipFile.read(Native Method)
         at java.util.zip.ZipFile.access$1400(ZipFile.java:60)
         at java.util.zip.ZipFile$ZipFileInputStream.read(ZipFile.java:734)
         at java.util.zip.ZipFile$ZipFileInflaterInputStream.fill(ZipFile.java:434)
         at java.util.zip.InflaterInputStream.read(InflaterInputStream.java:158)
         at java.io.FilterInputStream.read(FilterInputStream.java:133)
         at sun.nio.cs.StreamDecoder.readBytes(StreamDecoder.java:284)
         at sun.nio.cs.StreamDecoder.implRead(StreamDecoder.java:326)
         at sun.nio.cs.StreamDecoder.read(StreamDecoder.java:178)
         at java.io.InputStreamReader.read(InputStreamReader.java:184)
         at java.io.BufferedReader.fill(BufferedReader.java:161)
         at java.io.BufferedReader.readLine(BufferedReader.java:324)
         at java.io.BufferedReader.readLine(BufferedReader.java:389)
         at org.apache.pdfbox.pdmodel.font.encoding.GlyphList.loadList(GlyphList.java:147)
         at org.apache.pdfbox.pdmodel.font.encoding.GlyphList.<init>(GlyphList.java:137)
         at org.apache.pdfbox.text.LegacyPDFStreamEngine.<init>(LegacyPDFStreamEngine.java:120)
         at org.apache.pdfbox.text.PDFTextStripper.<init>(PDFTextStripper.java:214)

标签: java, sas, pdfbox, pdftotext

解决方案


推荐阅读