pdf - Google Apps Script:不支持从 application/octet-stream 格式转换为 application/pdf
问题描述
我遇到了与《Get pdf-attachments from Gmail as text》一问中类似的错误,但那里的解决办法在我的情况下不起作用。我已经启用了 Drive API,文档的扩展名为 .pdf,而且此脚本对其他文档(同样是 pdf)可以正常工作。将 PDF 读取为文本的代码来自《Get pdf-attachments from Gmail as text》。
请帮忙。
这是我的代码:
/**
 * Searches the inbox for threads from the last six days and passes the first
 * message of every thread whose subject is exactly 'subject' to Messages().
 *
 * Threads are visited from the last search result to the first. Only the
 * subject is read here; sender, recipient, date, body and attachments are not
 * fetched because this function never used them (Messages() reads what it
 * needs itself), which avoids needless Gmail reads per thread.
 */
function searchEmails() {
  var threads = GmailApp.search('in:inbox newer_than:6d');
  // An empty result simply skips the loop, so no separate length guard is needed.
  for (var t = threads.length - 1; t >= 0; t--) {
    // Only the first message of each thread is inspected.
    var message = threads[t].getMessages()[0];
    if (message.getSubject() === 'subject') {
      Messages(message);
    }
  }
}
/**
 * Reads the first attachment of a Gmail message as a PDF, converts it to text
 * with pdfToText() and picks individual values out of the text by position.
 *
 * NOTE(review): the body is truncated in this listing ("rest of my code");
 * the closing brace and whatever is done with the extracted values are not shown.
 *
 * @param {GmailMessage} message - the message whose first attachment is processed.
 */
function Messages(message) {
var attachments = message.getAttachments();
// Only attachments[0] is used, whatever its content type is.
// NOTE(review): presumably this is where the reported "application/octet-stream
// to application/pdf" conversion error occurs when the first attachment is not a PDF - confirm.
var blob = attachments[0].getAs(MimeType.PDF);
// body is not used in the visible part of the function.
var body = message.getBody();
var filetext = pdfToText(blob);
// Drop everything before the first "Title:" marker.
// NOTE(review): search() returns -1 when the marker is missing, and substr(-1)
// then keeps only the last character of the text.
filetext = filetext.substr(filetext.search("Title:"));
// Split on single spaces; the fixed indices below depend on the exact layout of the OCR output.
filetext = filetext.split(' ');
var msgValue = filetext[12];
var msgDate = filetext[6];
var msgID = message.getId();
// rest of my code
/**
 * Converts a PDF blob to plain text by uploading it to Drive with OCR enabled,
 * reading the text of the resulting Google Doc, and removing that temporary doc.
 *
 * Relies on the `Drive` advanced service (Files.insert / Files.remove) and on
 * DocumentApp being available in the script project.
 *
 * @param {Blob} blob - the PDF to convert; must provide getName() and getContentType().
 * @param {Object} [options]
 * @param {string} [options.path] - Drive folder path passed to getDriveFolderFromPath();
 *     its result is used as the parent of the temporary doc.
 * @param {string} [options.ocrLanguage='pl'] - language hint passed to the OCR request.
 * @param {boolean} [options.keepGdoc=false] - when truthy, the generated doc is not removed.
 * @returns {string} the body text of the generated Google Doc.
 */
function pdfToText (blob, options) {
  options = options || {};
  var parents = [];
  if (options.path) {
    // NOTE(review): this pushes whatever getDriveFolderFromPath() returns
    // (possibly null for an unknown path) - confirm Drive accepts it as a parent.
    parents.push(getDriveFolderFromPath(options.path));
  }
  var resource = {
    // Name the converted doc after the PDF, swapping a trailing "pdf" for "gdoc".
    title: blob.getName().replace(/pdf$/, 'gdoc'),
    mimeType: blob.getContentType(),
    parents: parents
  };
  var insertOpts = {
    ocr: true,
    ocrLanguage: options.ocrLanguage || 'pl'
  };
  // Save PDF as GDOC
  var gdocFile = Drive.Files.insert(resource, blob, insertOpts);
  try {
    // Get text from GDOC
    return DocumentApp.openById(gdocFile.id).getBody().getText();
  } finally {
    // Remove the temporary doc even when reading it failed, so a failed
    // conversion does not leave stray files behind in Drive.
    if (!options.keepGdoc) {
      Drive.Files.remove(gdocFile.id);
    }
  }
}
/**
 * Resolves a "/"-separated folder path, starting at the Drive root folder.
 *
 * Empty segments (leading, trailing or doubled slashes) are ignored, and a
 * missing or empty path resolves to the root folder itself.
 *
 * @param {string} [path] - folder path such as "/reports/2020".
 * @returns {Folder|null} the folder at the path, or null when a segment is not found.
 */
function getDriveFolderFromPath (path) {
  var segments = (path || "/").split("/");
  var folder = DriveApp.getRootFolder();
  for (var idx = 0; idx < segments.length; idx++) {
    var segment = segments[idx];
    if (!segment) {
      // Empty segment: stay in the current folder.
      continue;
    }
    if (!folder) {
      // An earlier segment was not found, so the whole path is unresolved.
      return null;
    }
    var matches = folder.getFoldersByName(segment);
    // Take the first folder with that name, if there is one.
    folder = matches.hasNext() ? matches.next() : null;
  }
  return folder;
}
解决方案
首先,我最初的代码本身是可以工作的。但是,原代码只读取第一个附件(attachments[0]),所以如果一封电子邮件中有多个附件,就会遇到我最初碰到的问题。解决方案是遍历所有附件,只处理内容类型为 application/pdf 的附件,如下所示:
/**
 * Searches the inbox for threads from the last six days, walks every attachment
 * of every message in those threads, and converts each attachment whose content
 * type is 'application/pdf' to text with pdfToText().
 *
 * NOTE(review): the body is truncated in this listing ("rest of my code");
 * the closing braces of the loops and of the function are not shown.
 */
function searchEmails() {
var threads = GmailApp.search('in:inbox newer_than:6d');
if (threads.length > 0) {
// NOTE(review): t is never used inside this loop; each iteration fetches and
// processes the messages of ALL threads again, so every attachment is handled
// threads.length times. The outer loop looks redundant - confirm.
for (var t=threads.length-1; t>=0; t--) {
// msgs[i][j] is message j of thread i.
var msgs = GmailApp.getMessagesForThreads(threads);
for (var i = 0 ; i < msgs.length; i++) {
for (var j = 0; j < msgs[i].length; j++) {
var attachments = msgs[i][j].getAttachments();
for (var k = 0; k < attachments.length; k++) {
var content = attachments[k].getContentType();
Logger.log(attachments[k].getName()) // log the attachment name so the file extension can be checked
// Skip anything that is not reported as a PDF, so getAs(MimeType.PDF) is only called on PDF attachments.
if (content == 'application/pdf'){
/*
you can check if the attachment has the expected name
var attachmentsName = attachments[k].getName();
if (attachmentsName == 'looking name'){
*/
var blob = attachments[k].getAs(MimeType.PDF);
var filetext = pdfToText(blob);
// Drop everything before the first "SZCZEGÓŁY" marker.
// NOTE(review): search() returns -1 when the marker is missing, and substr(-1)
// then keeps only the last character of the text.
filetext = filetext.substr(filetext.search("SZCZEGÓŁY"));
// Split on single spaces; the fixed indices below depend on the exact layout of the OCR output.
filetext = filetext.split(' ');
var msgValue = filetext[14] + filetext[15];
var msgDate = filetext[6];
var type = filetext[3];
// rest of my code
/**
 * Converts a PDF blob to plain text by uploading it to Drive with OCR enabled,
 * reading the text of the resulting Google Doc, and removing that temporary doc.
 *
 * Relies on the `Drive` advanced service (Files.insert / Files.remove) and on
 * DocumentApp being available in the script project.
 *
 * @param {Blob} blob - the PDF to convert; must provide getName() and getContentType().
 * @param {Object} [options]
 * @param {string} [options.path] - Drive folder path passed to getDriveFolderFromPath();
 *     its result is used as the parent of the temporary doc.
 * @param {string} [options.ocrLanguage='pl'] - language hint passed to the OCR request.
 * @param {boolean} [options.keepGdoc=false] - when truthy, the generated doc is not removed.
 * @returns {string} the body text of the generated Google Doc.
 */
function pdfToText (blob, options) {
  options = options || {};
  var parents = [];
  if (options.path) {
    // NOTE(review): this pushes whatever getDriveFolderFromPath() returns
    // (possibly null for an unknown path) - confirm Drive accepts it as a parent.
    parents.push(getDriveFolderFromPath(options.path));
  }
  var resource = {
    // Name the converted doc after the PDF, swapping a trailing "pdf" for "gdoc".
    title: blob.getName().replace(/pdf$/, 'gdoc'),
    mimeType: blob.getContentType(),
    parents: parents
  };
  var insertOpts = {
    ocr: true,
    ocrLanguage: options.ocrLanguage || 'pl'
  };
  // Save PDF as GDOC
  var gdocFile = Drive.Files.insert(resource, blob, insertOpts);
  try {
    // Get text from GDOC
    return DocumentApp.openById(gdocFile.id).getBody().getText();
  } finally {
    // Remove the temporary doc even when reading it failed, so a failed
    // conversion does not leave stray files behind in Drive.
    if (!options.keepGdoc) {
      Drive.Files.remove(gdocFile.id);
    }
  }
}
/**
 * Walks a "/"-separated folder path down from the Drive root folder.
 *
 * Empty path segments (leading, trailing or repeated slashes) are skipped, and
 * a missing or empty path yields the root folder.
 *
 * @param {string} [path] - folder path such as "/reports/2020".
 * @returns {Folder|null} the folder the path points to, or null if any segment does not exist.
 */
function getDriveFolderFromPath (path) {
  var names = (path || "/").split("/");
  var current = DriveApp.getRootFolder();
  for (var n = 0; n < names.length; n++) {
    var name = names[n];
    if (!name) {
      // Nothing between two slashes: keep the folder reached so far.
      continue;
    }
    if (!current) {
      // A previous lookup failed; the rest of the path cannot resolve.
      return null;
    }
    var found = current.getFoldersByName(name);
    // Use the first match, or null when no folder has that name.
    current = found.hasNext() ? found.next() : null;
  }
  return current;
}
推荐阅读
- node.js - 如何获取集合并加入参考字段中的数据?
- dynamic-programming - 到达第 n 个楼梯
- c - 在c中引用双指针的属性
- python - 如何检查输入后是否使用了字母?
- php - 如何将安装在主机服务器上的 phpmyadmin 连接到正在运行的 docker 容器内的 maria db?
- scala - 如何将案例类作为变量传递给ScalaReflection
- mongodb - 在对话中获取 2 人之间的最新消息 Mongo
- firebase - Firebase 功能未运行/记录
- python - 不和谐.py; 警告/罢工系统
- algorithm - 算法的大 O 时间复杂度