首页 > 解决方案 > 谷歌脚本;不支持从格式 application/octet-stream 到 application/pdf 的转换

问题描述

我遇到了与「ERROR Get pdf-attachments from Gmail as text」类似的错误,但那里的解决办法在我这里不起作用。Drive API 已启用,文档的扩展名为 .pdf,而且同一脚本对其他文档(同样是 pdf)可以正常运行。读取 pdf 文本的代码(Read pdf as text)来自「Get pdf-attachments from Gmail as text」。

请帮忙。

这是我的代码:

/**
 * Searches the inbox for threads from the last six days and hands the first
 * message of every thread whose subject is exactly 'subject' to Messages().
 *
 * Threads are visited from the end of the search result towards its start.
 */
function searchEmails() {
  var threads = GmailApp.search('in:inbox newer_than:6d');

  // An empty search result simply skips the loop, so no length guard is needed.
  for (var t = threads.length - 1; t >= 0; t--) {
    // Only the first message of each thread is inspected.
    var message = threads[t].getMessages()[0];

    // Only the subject is fetched here; sender, recipient, date, body and
    // attachments are not needed for the filter, so they are not requested.
    if (message.getSubject() === 'subject') {
      Messages(message);
    }
  }
}

// Extracts fields from the first attachment of a Gmail message.
// The visible part requests the attachment as a PDF blob, turns it into text
// with pdfToText(), and then picks values by word position. The listing is
// cut off by the author ("rest of my code"), so the function's end is not shown.
function Messages(message) {

  var attachments = message.getAttachments();
  // Only the first attachment is used, whatever its content type is.
  // NOTE(review): the "application/octet-stream to application/pdf" conversion
  // error from the page title presumably comes from this getAs() call when the
  // first attachment is not a PDF -- confirm.
  var blob = attachments[0].getAs(MimeType.PDF);
  // `body` is not read in the visible part of the function.
  var body = message.getBody();
  var filetext = pdfToText(blob);     
  // Keep the text from the first "Title:" onwards. search() returns -1 when
  // there is no match, in which case substr(-1) keeps only the last character.
  filetext = filetext.substr(filetext.search("Title:"));
  // Split on single spaces; the fields below are addressed by word index.
  filetext = filetext.split(' ');

  var msgValue = filetext[12];
  var msgDate = filetext[6];
  var msgID = message.getId(); 


// rest of my code

/**
 * Returns the text of a PDF blob by uploading it to Drive with OCR enabled,
 * reading the resulting Google Doc, and then removing that document.
 *
 * Uses the Drive advanced service (`Drive.Files.insert` / `Drive.Files.remove`)
 * and DocumentApp.
 *
 * @param {Blob} blob  Blob to convert; its name and content type are used for
 *                     the uploaded file's metadata.
 * @param {Object} [options]
 * @param {string} [options.path]         Drive folder path for the temporary
 *                                        document, resolved with
 *                                        getDriveFolderFromPath().
 * @param {string} [options.ocrLanguage]  OCR language code; defaults to 'pl'.
 * @param {boolean} [options.keepGdoc]    When truthy, the converted document
 *                                        is left in Drive.
 * @returns {string} Body text of the converted document.
 */
function pdfToText (blob, options) {
  var opts = options || {};

  var parents = [];
  if (opts.path) {
    // NOTE(review): getDriveFolderFromPath() yields null for a path that does
    // not exist, and null is then passed on as a parent -- confirm how
    // Drive.Files.insert reacts to that.
    parents.push(getDriveFolderFromPath(opts.path));
  }

  var pdfName = blob.getName();
  var resource = {
    // Swap a trailing "pdf" for "gdoc" in the uploaded file's title.
    title: pdfName.replace(/pdf$/, 'gdoc'),
    mimeType: blob.getContentType(),
    parents: parents
  };
  var insertOpts = {
    ocr: true,
    ocrLanguage: opts.ocrLanguage || 'pl'
  };

  // Save PDF as GDOC
  var gdocFile = Drive.Files.insert(resource, blob, insertOpts);

  try {
    // Get text from GDOC
    return DocumentApp.openById(gdocFile.id).getBody().getText();
  } finally {
    // Remove the temporary document even when reading it failed, so that a
    // failed extraction does not leave stray files in Drive.
    if (!opts.keepGdoc) {
      Drive.Files.remove(gdocFile.id);
    }
  }
}

/**
 * Resolves a "/"-separated folder path, starting at the Drive root folder.
 *
 * Empty segments (leading, trailing or doubled slashes) are ignored. For each
 * remaining segment the first folder with that name inside the current folder
 * is taken.
 *
 * @param {string} [path]  Folder path; a missing or empty path means the root.
 * @returns {Folder|null} The folder found, or null when a segment has no match.
 */
function getDriveFolderFromPath (path) {
  var segments = (path || "/").split("/");
  var folder = DriveApp.getRootFolder();

  for (var idx = 0; idx < segments.length; idx++) {
    var segment = segments[idx];

    // Empty segments leave the current position untouched.
    if (!segment) {
      continue;
    }

    // Once a lookup has failed, every later named segment stays unresolved.
    if (!folder) {
      folder = null;
      continue;
    }

    var matches = folder.getFoldersByName(segment);
    if (matches.hasNext()) {
      folder = matches.next();
    } else {
      folder = null;
    }
  }

  return folder;
}

标签: pdf, google-apps-script, google-drive-api

解决方案


首先,我的第一个代码有效。但是,如果电子邮件中有多个附件,您将遇到我最初遇到的问题。解决方案如下:

// Walks every attachment of every message in the inbox threads of the last
// six days and, for PDF attachments, extracts fields from the OCR'd text.
// The listing is cut off by the author ("rest of my code"), so the closing
// braces of the loops and of the function are not shown.
function searchEmails() {

  var threads = GmailApp.search('in:inbox newer_than:6d');

    if (threads.length > 0) {
      // NOTE(review): `t` is not used in the visible part of the loop body and
      // getMessagesForThreads() is given the whole `threads` array, so every
      // pass of this loop appears to reprocess all messages -- confirm whether
      // the outer loop is needed at all.
      for (var t=threads.length-1; t>=0; t--) {
        var msgs = GmailApp.getMessagesForThreads(threads);    

        // msgs is indexed as msgs[thread][message].
        for (var i = 0 ; i < msgs.length; i++) {    
          for (var j = 0; j < msgs[i].length; j++) {      
            var attachments = msgs[i][j].getAttachments();                

            for (var k = 0; k < attachments.length; k++) {          
              var content = attachments[k].getContentType();                
              Logger.log(attachments[k].getName()) // log the name so the file extension can be checked

            // Only attachments reported as PDF go on to getAs(MimeType.PDF);
            // all other content types are skipped.
            if (content == 'application/pdf'){
               

/* 
you can check if the attachment has the expected name
              var attachmentsName = attachments[k].getName();
              if (attachmentsName == 'looking name'){
*/

              var blob = attachments[k].getAs(MimeType.PDF);            
              var filetext = pdfToText(blob);            
              // Keep the text from the first "SZCZEGÓŁY" onwards, split it on
              // single spaces, and read the fields by word position.
              filetext = filetext.substr(filetext.search("SZCZEGÓŁY"));            
              filetext = filetext.split(' ');                        
              // Two consecutive words are joined into one value.
              var msgValue = filetext[14] + filetext[15];            
              var msgDate = filetext[6];            
              var type = filetext[3];            

   // rest of my code

/**
 * Returns the text of a PDF blob by uploading it to Drive with OCR enabled,
 * reading the resulting Google Doc, and then removing that document.
 *
 * Uses the Drive advanced service (`Drive.Files.insert` / `Drive.Files.remove`)
 * and DocumentApp.
 *
 * @param {Blob} blob  Blob to convert; its name and content type are used for
 *                     the uploaded file's metadata.
 * @param {Object} [options]
 * @param {string} [options.path]         Drive folder path for the temporary
 *                                        document, resolved with
 *                                        getDriveFolderFromPath().
 * @param {string} [options.ocrLanguage]  OCR language code; defaults to 'pl'.
 * @param {boolean} [options.keepGdoc]    When truthy, the converted document
 *                                        is left in Drive.
 * @returns {string} Body text of the converted document.
 */
function pdfToText (blob, options) {
  var opts = options || {};

  var parents = [];
  if (opts.path) {
    // NOTE(review): getDriveFolderFromPath() yields null for a path that does
    // not exist, and null is then passed on as a parent -- confirm how
    // Drive.Files.insert reacts to that.
    parents.push(getDriveFolderFromPath(opts.path));
  }

  var pdfName = blob.getName();
  var resource = {
    // Swap a trailing "pdf" for "gdoc" in the uploaded file's title.
    title: pdfName.replace(/pdf$/, 'gdoc'),
    mimeType: blob.getContentType(),
    parents: parents
  };
  var insertOpts = {
    ocr: true,
    ocrLanguage: opts.ocrLanguage || 'pl'
  };

  // Save PDF as GDOC
  var gdocFile = Drive.Files.insert(resource, blob, insertOpts);

  try {
    // Get text from GDOC
    return DocumentApp.openById(gdocFile.id).getBody().getText();
  } finally {
    // Remove the temporary document even when reading it failed, so that a
    // failed extraction does not leave stray files in Drive.
    if (!opts.keepGdoc) {
      Drive.Files.remove(gdocFile.id);
    }
  }
}

/**
 * Walks a "/"-separated path downwards from the Drive root folder.
 *
 * Blank path segments are skipped; each named segment selects the first
 * sub-folder with that name.
 *
 * @param {string} [path]  Folder path; omitted or empty resolves to the root.
 * @returns {Folder|null} The resolved folder, or null if any segment is missing.
 */
function getDriveFolderFromPath (path) {
  // Blank segments never change the result, so they are dropped up front.
  var names = (path || "/").split("/").filter(function (part) {
    return Boolean(part);
  });
  var current = DriveApp.getRootFolder();

  for (var n = 0; n < names.length; n++) {
    if (current) {
      var candidates = current.getFoldersByName(names[n]);
      current = candidates.hasNext() ? candidates.next() : null;
    } else {
      // A previous segment was not found; the remainder cannot resolve.
      current = null;
    }
  }

  return current;
}

推荐阅读