java - 尝试利用从 PDF 中捕获的文本
问题描述
我有一个名为的数组myArray
,其中包含由空格分隔的单词,并从 PDF 的第一页到最后一页进行修剪。我写了一个简单的print
数组方法,它遍历并逐个打印每个元素,看起来很棒!
在我让它通过另一个for loop
数组的长度并检查后立即将数组if (myArray[i].equals("(19)")) {//print something}
打印到控制台时,很明显该值(19)
存在于数组中。
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Scanner;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
public class Main {
static File file;
static PDFTextStripper textStripper;
static PDDocument pdDoc;
static COSDocument cosDoc;
static String parsedText;
static int sum = 0;
static String[] myArray;
static String[] events = {"400", "800", "1500",
"3000", "5000", "10000"};
public static void main(String[] args) {
//Read the PDF file into instance variable file
readFile();
try {
parsePDF(file);
} catch (IOException e) {
e.printStackTrace();
}
myArray = parsedText.split(" ");
removeWhiteSpace(myArray);
printArray(myArray);
//System.out.println();
String currentEvent = "";
for (int i = 0; i < myArray.length; i++) {
if (contains(myArray[i])) {
currentEvent = myArray[i];
}
if (!currentEvent.equals("")) {
if (myArray[i].charAt(0) == '(' && (myArray[i].charAt(myArray[i].length() - 1) == ')')) {
String formatedRunners = "";
//It is possible to see some numbers such as (19)) or (19)
if (containsCharacter(myArray[i], ')') == 2) {
formatedRunners = myArray[i].substring(1, myArray[i].length() - 2);
} else {
formatedRunners = myArray[i].substring(1, myArray[i].length() - 1);
}
int numberOfRunners = Integer.parseInt(formatedRunners);
int distance = Integer.parseInt(currentEvent);
sum += numberOfRunners * distance;
//reset currentEvent
currentEvent = "";
}
}
}
//Print total distance in meters
System.out.println(sum + " meters");
//Convert meters to miles using the following equation: meters / 1609.344
System.out.println( Math.round((sum / 1609.344)) + " miles");
}
public static void readFile() {
Scanner c = new Scanner(System.in);
System.out.println("Enter a file path: ");
String filePath = c.nextLine();
file = new File(filePath);
}
public static void parsePDF(File file) throws IOException {
textStripper = new PDFTextStripper();
pdDoc = PDDocument.load(file);
//Parse PDF
textStripper.setStartPage(1);
//textStripper.setEndPage();
//Parsed String
parsedText = textStripper.getText(pdDoc);
}
public static boolean contains(String s) {
for (int i = 0; i < events.length; i++) {
if (s.equals(events[i])) {
return true;
}
}
return false;
}
public static void printArray(String[] a) {
for (int i = 0; i < a.length; i++) {
System.out.println(a[i]);
}
}
public static void removeWhiteSpace(String[] a) {
for (int i = 0; i < myArray.length; i++) {
if (myArray[i].equals("")) {
//Use some filler to avoid crashes when checking characters
myArray[i] = "NULL";
}
//Trim off all extra whitespace
myArray[i] = myArray[i].trim();
}
}
public static int containsCharacter(String str, char c) {
int count = 0;
for (int i = 0; i < str.length(); i++) {
if (str.charAt(i) == c) {
count++;
}
}
return count;
}
}
这是我想要的:
- 解析和修剪等(OK)
- 迭代
myArray
(在 main 方法中)并检测事件(OK) - 如果发生事件,则下一个值必须
(Any number)
类似于(19)
(NOK) - 步骤 3 中的数字将用于计算另一个数字
- 重置当前事件以一遍又一遍地重复该过程。
似乎它正在正确读取每个事件,但只选择 (19)) 而不是 (19)。
解决方案
您的代码中有几个问题(无异常处理、一切都是静态的、小错误等),但我将专注于主要问题。(我删除了我没有更改的代码)
public class Main {
static File file;
static PDFTextStripper textStripper;
static PDDocument pdDoc;
static COSDocument cosDoc;
static String parsedText;
static int sum = 0;
static String[] myArray = {"Seeded", "3000", "random", 25, "(44)", "1500", "random", "(13)"};
static String[] events = {"400", "800", "1500", "3000", "5000", "10000", "200.000"};
public static void main(String[] args) {
//Read the PDF file into instance variable file
readFile();
try {
parsePDF(file);
} catch (IOException e) {
e.printStackTrace();
}
myArray = parsedText.split(" ");
removeWhiteSpace(myArray);
String currentEvent = "";
for (int i = 0; i < myArray.length; i++) {
if (contains(myArray[i])) {
currentEvent = myArray[i];
}
else if (!currentEvent.isEmpty()) {
Integer value = extractNumber(myArray[i]);
if (!myArray[i].isEmpty() && value!=null) {
int distance = Integer.parseInt(currentEvent);
sum += value.intValue() * distance;
//reset currentEvent
currentEvent = "";
}
}
}
//Print total distance in meters
System.out.println(sum + " meters");
//Convert meters to miles using the following equation: meters / 1609.344
System.out.println( Math.round((sum / 1609.344)) + " miles");
}
public static Integer extractNumber(String toCheck) {
Pattern r = Pattern.compile("^.*?\\([^\\d]*(\\d+)[^\\d]*\\).*$");
Matcher m = r.matcher(toCheck);
if(m.find()) {
return Integer.valueOf(m.group(1));
}
return null;
}
public static void removeWhiteSpace(String[] a) {
for (int i = 0; i < myArray.length; i++) {
//Trim off all extra whitespace
myArray[i] = myArray[i].trim();
}
}
结果是
151500 meters
94 miles