duplicates - 如果 pdf 包含“重复的书签”,如何使用 itext 7 按书签拆分 pdf 文件
问题描述
我正在尝试使用 itext7 按其书签拆分 pdf。
问题:如果 PDF 的大纲树中其他位置存在同名书签,先前的条目就会被覆盖,导致无法正确拆分。
重现问题的示例代码:
/**
 * Visits the direct children of {@code outline} and hands every child that has a destination
 * to {@link #prepareIndexFile}, which appends its title and page number to the output lists.
 *
 * @param outline     outline node whose children are visited; {@code null} is treated as
 *                    "no bookmarks" and leaves the lists untouched
 * @param names       named-destination lookup table passed through to {@code prepareIndexFile}
 * @param pdfDocument document the outlines belong to
 * @param titles      output list receiving bookmark titles
 * @param pageNum     output list receiving the page number for each title (same order)
 */
public void walkOutlines(PdfOutline outline, Map<String, PdfObject> names, PdfDocument pdfDocument,
        List<String> titles, List<Integer> pageNum) {
    if (outline == null) {
        // Nothing to traverse; avoid a NullPointerException on getAllChildren().
        return;
    }
    for (PdfOutline child : outline.getAllChildren()) {
        if (child.getDestination() != null) {
            // prepareIndexFile declares exactly five parameters; the stray sixth argument was removed.
            prepareIndexFile(child, names, pdfDocument, titles, pageNum);
        }
    }
}
//------------Getting pageNumbers from outlines
/**
 * Appends the title and page number of {@code outline} to {@code titles} / {@code pageNum},
 * then does the same recursively for every child outline.
 *
 * NOTE(review): this version casts the destination object to PdfString and the looked-up
 * value to PdfArray without checking their types, so it only works for destinations that
 * are stored as strings and resolve to an array in {@code names}.
 *
 * @param outline     outline entry to record
 * @param names       map from destination name to the object stored under that name
 * @param pdfDocument document used to translate a page dictionary into a page number
 * @param titles      output list receiving the outline titles
 * @param pageNum     output list receiving the page numbers, in the same order as titles
 */
public void prepareIndexFile(PdfOutline outline, Map<String, PdfObject> names, PdfDocument pdfDocument,List<String>titles,List<Integer>pageNum) {
String title = outline.getTitle();
PdfDestination pdfDestination = outline.getDestination();
// Unchecked cast: the destination object is treated as a string name.
String pdfStr = ((PdfString)pdfDestination.getPdfObject()).toUnicodeString();
// Look the name up; the stored value is treated as an array.
PdfArray array = (PdfArray) names.get(pdfStr);
// Element 0 of the array is used as the page dictionary; null when the name is not in the map.
PdfObject pdfObj = array != null ? array.get(0) : null;
Integer pageNumber = pdfDocument.getPageNumber((PdfDictionary)pdfObj);
titles.add(title);
pageNum.add(pageNumber);
// Recurse into nested outlines so they are recorded after their parent.
if(outline.getAllChildren().size() > 0) {
for (PdfOutline child : outline.getAllChildren()){
prepareIndexFile(child,names,pdfDocument,titles,pageNum);
}
}
}
/**
 * Splits {@code inputFile} into one PDF per bookmark and writes the parts to {@code outputFolder}.
 * Each part starts at its bookmark's page and ends on the page before the next bookmark
 * (the last part runs to the end of the document).
 *
 * @param inputFile    path of the PDF to split
 * @param outputFolder directory that receives the generated files
 * @return {@code true} when bookmarks were found and every part was written;
 *         {@code false} when there are no bookmarks or any error occurred
 */
public boolean splitPdf(String inputFile, final String outputFolder) {
    boolean splitSuccess = true;
    // try-with-resources guarantees the source document is closed, also on failure.
    try (PdfDocument pdfDoc = new PdfDocument(new PdfReader(inputFile))) {
        final List<String> titles = new ArrayList<String>();
        final List<Integer> pageNum = new ArrayList<Integer>();
        PdfNameTree destsTree = pdfDoc.getCatalog().getNameTree(PdfName.Dests);
        Map<String, PdfObject> names = destsTree.getNames(); // named destinations
        PdfOutline root = pdfDoc.getOutlines(false);         // bookmark tree
        if (root != null) {
            // walkOutlines takes five parameters; the undeclared extra argument was removed.
            walkOutlines(root, names, pdfDoc, titles, pageNum);
        }
        if (titles.isEmpty()) {
            splitSuccess = false;
        } else {
            int lastIndex = titles.size() - 1;
            for (int i = 0; i <= lastIndex; i++) {
                int startPage = pageNum.get(i);
                int endPage;
                if (i == lastIndex) {
                    endPage = pdfDoc.getNumberOfPages();
                } else {
                    // Stop one page before the next bookmark, but never before startPage:
                    // a next bookmark on the same or an earlier page yields a single-page part
                    // instead of an empty page range.
                    endPage = Math.max(startPage, pageNum.get(i + 1) - 1);
                }
                String outFileName = outputFolder + File.separator + getFileName(titles.get(i)) + ".pdf";
                // Resources are closed in reverse order: the document first, then its writer.
                try (PdfWriter pdfWriter = new PdfWriter(outFileName);
                     PdfDocument newDocument = new PdfDocument(pdfWriter,
                             new DocumentProperties().setEventCountingMetaInfo(null))) {
                    pdfDoc.copyPagesTo(startPage, endPage, newDocument);
                }
            }
        }
    } catch (Exception e) {
        // Report the failure instead of swallowing it silently.
        System.err.println("Failed to split " + inputFile + ": " + e);
        splitSuccess = false;
    }
    return splitSuccess;
}
已找到根本原因:PdfNameTree 中的 items.put(name.toUnicodeString(), names.get(k)); 以名称字符串作为键,相同的名称会覆盖之前的条目。
如何克服这个问题?
提前致谢
解决方案
这部分代码:
// Original lookup: the destination object is cast to PdfString without a type check.
PdfDestination pdfDestination = outline.getDestination();
String pdfStr = ((PdfString)pdfDestination.getPdfObject()).toUnicodeString();
PdfArray array = (PdfArray) names.get(pdfStr);
// Element 0 of the looked-up array is used as the page dictionary.
PdfObject pdfObj = array != null ? array.get(0) : null;
Integer pageNumber = pdfDocument.getPageNumber((PdfDictionary)pdfObj);
没有考虑到目标(destination)可能不是命名目标、而是直接显式引用页面的情况。
因此需要将代码改写如下:
PdfDestination pdfDestination = outline.getDestination();
PdfObject pdfObj = null;
if (pdfDestination.getPdfObject().isString()) {
// Destination stored as a string: look it up in the names map and take element 0 of the array.
String pdfStr = ((PdfString) pdfDestination.getPdfObject()).toUnicodeString();
PdfArray array = (PdfArray) names.get(pdfStr);
if (array != null) {
pdfObj = array.get(0);
}
} else if (pdfDestination.getPdfObject().isArray() && ((PdfArray)pdfDestination.getPdfObject()).get(0).isDictionary()) {
// Destination stored as an array whose first element is a dictionary: use that element directly.
pdfObj = ((PdfArray)pdfDestination.getPdfObject()).get(0);
}
// NOTE(review): pdfObj is still null here when neither branch matched or the name was not found.
Integer pageNumber = pdfDocument.getPageNumber((PdfDictionary)pdfObj);
此外,如果想得到包含整条父级链的完整标题,需要将 String title = outline.getTitle(); 这一行
替换为以下代码:
// Start with the outline's own title, then prefix each ancestor's title, separated by ".".
String title = outline.getTitle();
PdfOutline parentChain = outline.getParent();
while (parentChain != null) {
title = parentChain.getTitle() + "." + title;
// Move one level up; the loop ends once getParent() returns null.
parentChain = parentChain.getParent();
}
结果,我在输出目录中得到了 6 个文件,其中 5 个文件每个 1 页,一个文件 4 页。
完整代码:
/**
 * Visits the direct children of {@code outline} and hands every child that has a destination
 * to {@link #prepareIndexFile}, which appends its title and page number to the output lists.
 *
 * @param outline     outline node whose children are visited; {@code null} is treated as
 *                    "no bookmarks" and leaves the lists untouched
 * @param names       named-destination lookup table passed through to {@code prepareIndexFile}
 * @param pdfDocument document the outlines belong to
 * @param titles      output list receiving bookmark titles
 * @param pageNum     output list receiving the page number for each title (same order)
 */
public void walkOutlines(PdfOutline outline, Map<String, PdfObject> names, PdfDocument pdfDocument,
        java.util.List<String> titles, java.util.List<Integer> pageNum) {
    if (outline == null) {
        // Nothing to traverse; avoid a NullPointerException on getAllChildren().
        return;
    }
    for (PdfOutline child : outline.getAllChildren()) {
        if (child.getDestination() == null) {
            continue; // top-level entries without a destination are skipped, as before
        }
        prepareIndexFile(child, names, pdfDocument, titles, pageNum);
    }
}
//------------Getting pageNumbers from outlines
/**
 * Records {@code outline} (parent-qualified title plus target page number) in the output lists
 * and then processes all of its children recursively.
 *
 * <p>An outline whose destination is missing or cannot be resolved to a page of
 * {@code pdfDocument} is not recorded, but its children are still visited.
 *
 * @param outline     outline entry to record
 * @param names       map from destination name to the object stored under that name
 * @param pdfDocument document used to translate a page dictionary into a page number
 * @param titles      output list receiving the qualified titles
 * @param pageNum     output list receiving the page numbers, in the same order as titles
 */
public void prepareIndexFile(PdfOutline outline, Map<String, PdfObject> names, PdfDocument pdfDocument,
        java.util.List<String> titles, java.util.List<Integer> pageNum) {
    PdfObject pageObj = resolveDestinationPage(outline.getDestination(), names);
    if (pageObj != null) {
        int pageNumber = pdfDocument.getPageNumber((PdfDictionary) pageObj);
        // Only a positive number identifies a page that can start a split range.
        if (pageNumber > 0) {
            titles.add(buildQualifiedTitle(outline));
            pageNum.add(pageNumber);
        }
    }
    for (PdfOutline child : outline.getAllChildren()) {
        prepareIndexFile(child, names, pdfDocument, titles, pageNum);
    }
}

/** Builds "ancestor. ... .parent.title" by walking the parent chain up to the outline root. */
private String buildQualifiedTitle(PdfOutline outline) {
    String title = outline.getTitle();
    for (PdfOutline parent = outline.getParent(); parent != null; parent = parent.getParent()) {
        title = parent.getTitle() + "." + title;
    }
    return title;
}

/** Returns the page dictionary a destination points to, or {@code null} if it cannot be resolved. */
private PdfObject resolveDestinationPage(PdfDestination destination, Map<String, PdfObject> names) {
    if (destination == null) {
        return null;
    }
    PdfObject target = destination.getPdfObject();
    if (target != null && target.isString()) {
        // Named destination: look the name up in the table of named destinations.
        target = names != null ? names.get(((PdfString) target).toUnicodeString()) : null;
        if (target != null && target.isDictionary()) {
            // A named destination may also be a dictionary holding the array under /D.
            target = ((PdfDictionary) target).get(PdfName.D);
        }
    }
    // Explicit destination array: element 0 is the page when it is a dictionary.
    if (target != null && target.isArray() && !((PdfArray) target).isEmpty()) {
        PdfObject first = ((PdfArray) target).get(0);
        if (first != null && first.isDictionary()) {
            return first;
        }
    }
    return null;
}
/**
 * Splits {@code inputFile} into one PDF per bookmark and writes the parts to {@code outputFolder}.
 * Each part starts at its bookmark's page and ends on the page before the next bookmark
 * (the last part runs to the end of the document). Files are named after the bookmark title.
 * An I/O failure is reported on standard error and aborts the split.
 *
 * @param inputFile    path of the PDF to split
 * @param outputFolder directory that receives the generated files
 */
public void splitPdf(String inputFile, final String outputFolder) {
    // try-with-resources guarantees the source document is closed, also on failure.
    try (PdfDocument pdfDoc = new PdfDocument(new PdfReader(inputFile))) {
        final java.util.List<String> titles = new ArrayList<String>();
        final java.util.List<Integer> pageNum = new ArrayList<Integer>();
        PdfNameTree destsTree = pdfDoc.getCatalog().getNameTree(PdfName.Dests);
        Map<String, PdfObject> names = destsTree.getNames(); // named destinations
        PdfOutline root = pdfDoc.getOutlines(false);         // bookmark tree
        if (root != null) {
            walkOutlines(root, names, pdfDoc, titles, pageNum);
        }

        // Tracks file names already written so identical titles do not overwrite each other.
        java.util.Set<String> usedNames = new java.util.HashSet<String>();
        int lastIndex = titles.size() - 1;
        for (int i = 0; i <= lastIndex; i++) {
            int startPage = pageNum.get(i);
            int endPage;
            if (i == lastIndex) {
                endPage = pdfDoc.getNumberOfPages();
            } else {
                // Stop one page before the next bookmark, but never before startPage:
                // a next bookmark on the same or an earlier page yields a single-page part
                // instead of an empty page range.
                endPage = Math.max(startPage, pageNum.get(i + 1) - 1);
            }

            // Bookmark titles are arbitrary text: replace characters that are path separators
            // or otherwise not allowed in file names on common file systems.
            String baseName = titles.get(i).replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
            String fileName = baseName;
            int suffix = 2;
            while (!usedNames.add(fileName)) {
                fileName = baseName + " (" + suffix++ + ")";
            }

            String outFileName = outputFolder + File.separator + fileName + ".pdf";
            // Resources are closed in reverse order: the document first, then its writer.
            try (PdfWriter pdfWriter = new PdfWriter(outFileName);
                 PdfDocument newDocument = new PdfDocument(pdfWriter,
                         new DocumentProperties().setEventCountingMetaInfo(null))) {
                pdfDoc.copyPagesTo(startPage, endPage, newDocument);
            }
        }
    } catch (IOException e) {
        System.err.println("Failed to split " + inputFile + ": " + e);
    }
}
推荐阅读
- python - PythonOCC(OpenCascade)中平移和旋转的基本误区
- javascript - 在 React 中使用 createPortal 时,在挂载 DOM 之前触发 componentDidMount
- c++ - C++ 中的字数
- excel - 如何使用另一个工作表的多行中的值填充列(Excel)
- emacs - 如何屏蔽已经安装的 Emacs 包?
- python - SPARQL - 未知的命名空间前缀错误
- postgresql - 检测到 Postgres Npgsql.PostgresException 死锁
- python - 等待结果在 Python 数据抓取中加载
- javascript - 如何使用 jQuery 循环遍历 json 对象
- r - 在没有先验向量的情况下使用 dplyr 重新排列列(降序/升序)