首页 > 解决方案 > 尝试利用从 PDF 中捕获的文本

问题描述

我有一个名为 myArray 的数组,其中包含从 PDF 第一页到最后一页提取的、以空格分隔并经过修剪(trim)的单词。我写了一个简单的 printArray 方法,它遍历数组并逐个打印每个元素,看起来很棒!

紧接着,我让程序用另一个 for 循环按数组长度遍历,并检查 if (myArray[i].equals("(19)")) {//print something}。把数组打印到控制台时,可以清楚地看到值 (19) 确实存在于数组中。

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Scanner;

import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * Reads a PDF, extracts its text, and sums "runners x distance" for every
 * event found in the text.
 *
 * <p>An event is a token equal to one of {@link #events}. The first token
 * after an event that looks like {@code (N)} (optionally followed by extra
 * closing parentheses, e.g. {@code (19))}) is taken as the number of runners
 * for that event.
 */
public class Main {

    static File file;
    static PDFTextStripper textStripper;
    static PDDocument pdDoc;
    static COSDocument cosDoc;
    static String parsedText;
    static int sum = 0;
    static String[] myArray;
    static String[] events = {"400", "800", "1500",
            "3000", "5000", "10000"};

    /**
     * Program entry point: asks for a file path, parses the PDF, and prints
     * the total distance in meters and in miles.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        // Read the PDF file path into the static field `file`.
        readFile();

        try {
            parsePDF(file);
        } catch (IOException e) {
            // Without parsed text there is nothing to compute; continuing
            // would only fail later with a NullPointerException.
            System.err.println("Could not read PDF '" + file + "': " + e.getMessage());
            return;
        }

        // Split on ANY whitespace run. Extracted PDF text contains line
        // breaks, and splitting on a single space leaves tokens such as
        // "(19)\n800" glued together, so "(19)" is never seen on its own.
        myArray = parsedText.trim().split("\\s+");
        removeWhiteSpace(myArray);
        printArray(myArray);

        String currentEvent = "";
        for (String token : myArray) {

            if (contains(token)) {
                currentEvent = token;
            }

            if (currentEvent.isEmpty()) {
                continue;
            }

            int numberOfRunners = parseRunnerCount(token);
            if (numberOfRunners < 0) {
                continue;
            }

            int distance = Integer.parseInt(currentEvent);
            sum += numberOfRunners * distance;

            // Reset so the next runner count is only accepted after a new event.
            currentEvent = "";
        }

        // Print total distance in meters
        System.out.println(sum + " meters");

        // Convert meters to miles using the following equation: meters / 1609.344
        System.out.println(Math.round((sum / 1609.344)) + " miles");
    }

    /**
     * Parses a token of the form {@code (digits)} followed by zero or more
     * additional {@code ')'} characters.
     *
     * @param token the token to inspect; may be {@code null}
     * @return the number between the parentheses, or {@code -1} when the
     *         token does not have that form or the number does not fit an int
     */
    private static int parseRunnerCount(String token) {
        if (token == null || token.length() < 3
                || token.charAt(0) != '('
                || token.charAt(token.length() - 1) != ')') {
            return -1;
        }

        // Drop every trailing ')' so "(19)", "(19))" and "(19)))" all yield "19".
        int end = token.length();
        while (end > 1 && token.charAt(end - 1) == ')') {
            end--;
        }

        String digits = token.substring(1, end);
        if (digits.isEmpty()) {
            return -1;
        }
        for (int i = 0; i < digits.length(); i++) {
            char ch = digits.charAt(i);
            if (ch < '0' || ch > '9') {
                return -1;
            }
        }

        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException overflow) {
            // Only reachable when the digit run exceeds the int range.
            return -1;
        }
    }

    /** Prompts on standard input for a file path and stores it in {@link #file}. */
    public static void readFile() {
        // The Scanner is intentionally not closed: closing it would close System.in.
        Scanner c = new Scanner(System.in);
        System.out.println("Enter a file path: ");
        String filePath = c.nextLine();
        file = new File(filePath);
    }

    /**
     * Extracts the text of the given PDF, starting at page 1, into
     * {@link #parsedText}.
     *
     * <p>The document is closed before this method returns, so
     * {@link #pdDoc} refers to a closed document afterwards.
     *
     * @param file the PDF file to load
     * @throws IOException if the file cannot be loaded or its text extracted
     */
    public static void parsePDF(File file) throws IOException {

        textStripper = new PDFTextStripper();

        try (PDDocument document = PDDocument.load(file)) {
            pdDoc = document;

            textStripper.setStartPage(1);
            parsedText = textStripper.getText(document);
        }
    }

    /**
     * Tells whether the given string is one of the known events.
     *
     * @param s the token to look up
     * @return {@code true} if {@code s} equals an entry of {@link #events}
     */
    public static boolean contains(String s) {
        for (String event : events) {
            if (event.equals(s)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Prints every element of the array on its own line.
     *
     * @param a the array to print
     */
    public static void printArray(String[] a) {
        for (String element : a) {
            System.out.println(element);
        }
    }

    /**
     * Trims every element of {@code a} in place and replaces elements that
     * are empty after trimming with the filler {@code "NULL"}.
     *
     * @param a the array to clean; a {@code null} array is ignored
     */
    public static void removeWhiteSpace(String[] a) {
        if (a == null) {
            return;
        }
        for (int i = 0; i < a.length; i++) {
            // Trim BEFORE the emptiness check so whitespace-only elements
            // also receive the filler instead of ending up as "".
            String trimmed = (a[i] == null) ? "" : a[i].trim();
            a[i] = trimmed.isEmpty() ? "NULL" : trimmed;
        }
    }

    /**
     * Counts how many times a character occurs in a string.
     *
     * @param str the string to scan
     * @param c   the character to count
     * @return the number of occurrences of {@code c} in {@code str}
     */
    public static int containsCharacter(String str, char c) {

        int count = 0;
        for (int i = 0; i < str.length(); i++) {
            if (str.charAt(i) == c) {
                count++;
            }
        }
        return count;
    }
}

这是我想要的:

  1. 解析和修剪等(OK)
  2. 迭代myArray(在 main 方法中)并检测事件(OK)
  3. 如果检测到事件,则其后的值必须是 (任意数字) 的形式,例如 (19) (NOK)
  4. 步骤 3 中的数字将用于计算另一个数字
  5. 重置当前事件以一遍又一遍地重复该过程。

似乎它正在正确读取每个事件,但只选择 (19)) 而不是 (19)。

标签: java arrays

解决方案


您的代码中有几个问题(无异常处理、一切都是静态的、小错误等),但我将专注于主要问题。(我删除了我没有更改的代码)

public class Main {

static File file;
static PDFTextStripper textStripper;
static PDDocument pdDoc;
static COSDocument cosDoc;
static String parsedText;
static int sum = 0;
// Sample tokens. Every element must be a String: the bare int literal 25
// does not compile inside a String[] initializer, so it is quoted here.
static String[] myArray = {"Seeded", "3000", "random", "25", "(44)", "1500", "random", "(13)"};
// Tokens recognised as events.
// NOTE(review): "200.000" is not parseable by Integer.parseInt — confirm it is intended.
static String[] events = {"400", "800", "1500", "3000", "5000", "10000", "200.000"};

/**
 * Program entry point: reads a PDF path, extracts the text, and prints the
 * summed "runners x distance" in meters and in miles.
 *
 * <p>A token equal to a known event (see {@code contains}) starts an event;
 * the first later token from which {@code extractNumber} yields a value is
 * used as that event's runner count.
 *
 * @param args unused
 */
public static void main(String[] args) {

    // Read the PDF file path into the static field `file`.
    readFile();

    try {
        parsePDF(file);
    } catch (IOException e) {
        e.printStackTrace();
    }

    // If no text is available, stop here instead of failing with a
    // NullPointerException on the split below.
    if (parsedText == null) {
        System.err.println("No text could be extracted from " + file);
        return;
    }

    // Split on any whitespace run: extracted PDF text contains line breaks,
    // and splitting on a single space leaves tokens like "(19)\n800" joined.
    myArray = parsedText.trim().split("\\s+");
    removeWhiteSpace(myArray);

    String currentEvent = "";
    for (String token : myArray) {

        if (contains(token)) {
            currentEvent = token;
            continue;
        }
        if (currentEvent.isEmpty()) {
            continue;
        }

        Integer value = extractNumber(token);
        if (value == null) {
            continue;
        }

        final int distance;
        try {
            distance = Integer.parseInt(currentEvent);
        } catch (NumberFormatException e) {
            // An event such as "200.000" is not a plain integer; skip it
            // rather than aborting the whole run.
            System.err.println("Skipping non-integer event: " + currentEvent);
            currentEvent = "";
            continue;
        }

        sum += value.intValue() * distance;

        // Reset so the next count is only accepted after a new event.
        currentEvent = "";
    }

    // Print total distance in meters
    System.out.println(sum + " meters");

    // Convert meters to miles using the following equation: meters / 1609.344
    System.out.println(Math.round((sum / 1609.344)) + " miles");
}

/**
 * Matches "(", optional non-digits, a digit run (group 1), optional
 * non-digits, ")". Compiled once instead of on every call. It is left
 * unanchored because {@code find()} already searches anywhere in the input;
 * the former {@code ^.*?} / {@code .*$} wrappers made the match fail when the
 * token contained a line break, since {@code .} does not match line terminators.
 */
private static final Pattern PARENTHESIZED_NUMBER =
        Pattern.compile("\\([^\\d]*(\\d+)[^\\d]*\\)");

/**
 * Extracts the first number written inside parentheses, e.g. {@code 19}
 * from {@code "(19)"} or {@code "(19))"}.
 *
 * @param toCheck the token to inspect; may be {@code null}
 * @return the number, or {@code null} when {@code toCheck} is {@code null},
 *         contains no parenthesized number, or the number exceeds the int range
 */
public static Integer extractNumber(String toCheck) {
    if (toCheck == null) {
        return null;
    }

    Matcher m = PARENTHESIZED_NUMBER.matcher(toCheck);
    if (!m.find()) {
        return null;
    }

    try {
        return Integer.valueOf(m.group(1));
    } catch (NumberFormatException overflow) {
        // group(1) is all digits, so this only happens on int overflow.
        return null;
    }
}

/**
 * Trims leading and trailing whitespace from every element of the given
 * array, in place.
 *
 * <p>Operates on the array passed in rather than on the static
 * {@code myArray}, so it works for any caller.
 *
 * @param a the array to clean; a {@code null} array and {@code null}
 *          elements are left untouched
 */
public static void removeWhiteSpace(String[] a) {
    if (a == null) {
        return;
    }
    for (int i = 0; i < a.length; i++) {
        if (a[i] != null) {
            // Trim off all extra whitespace
            a[i] = a[i].trim();
        }
    }
}

结果是 151500 meters 94 miles


推荐阅读