parsing - 表格到 Bigquery - 设置最大允许错误
问题描述
我使用下面这个脚本从 Google 表格上传数据到 BigQuery。如何在其中设置允许的最大错误数?我只想忽略所有错误,无论有多少错误都照常上传数据。我有很多不同的大表格,它们的格式每次都不一样。
我能够手动正确加载此数据(我只设置了 100 或 1000 个允许的错误)。但是这个脚本使用 autodetect:true 运行并且不允许出错。谢谢
/**
 * Function to run from the UI menu.
 *
 * Uploads the sheets defined in the active sheet into BigQuery.
 */
function runFromUI() {
  // Zero-based positions of each field within a data row.
  const COL = {
    sheetUrl: 1,
    projectId: 2,
    datasetId: 3,
    tableId: 4,
    append: 5,
    status: 6,
  };
  const activeSheet = SpreadsheetApp.getActiveSheet();
  // Read every row of the data range, dropping the header (first) row.
  const dataRows = activeSheet.getDataRange().getValues().slice(1);
  // Upload each configured sheet and write the outcome into the status column.
  dataRows.forEach((dataRow, rowIndex) => {
    const status = sheetToBigQuery(
      dataRow[COL.sheetUrl],
      dataRow[COL.projectId],
      dataRow[COL.datasetId],
      dataRow[COL.tableId],
      dataRow[COL.append],
    );
    // rowIndex + 2: one for the skipped header row, one because ranges are 1-based.
    activeSheet.getRange(rowIndex + 2, COL.status + 1).setValue(status);
  });
}
/**
 * Uploads a single sheet to BigQuery.
 *
 * @param {string} sheetUrl - The Google Sheet Url containing the data to upload.
 * @param {string} projectId - Google Cloud Project ID.
 * @param {string} datasetId - BigQuery Dataset ID.
 * @param {string} tableId - BigQuery Table ID.
 * @param {bool} append - Appends to BigQuery table if true, otherwise replaces the content.
 * @param {number} [maxBadRecords=1000] - Maximum number of bad records BigQuery
 *     may skip before failing the load job. Raise it (or pass a large number)
 *     to tolerate sheets whose rows don't all match the autodetected schema.
 *
 * @return {string} status - Returns the status of the job.
 */
function sheetToBigQuery(sheetUrl, projectId, datasetId, tableId, append, maxBadRecords = 1000) {
  try {
    createDatasetIfDoesntExist(projectId, datasetId);
  } catch (e) {
    return `${e}: Please verify your "Project ID" exists and you have permission to edit BigQuery`;
  }
  let sheet;
  try {
    sheet = openSheetByUrl(sheetUrl);
  } catch (e) {
    return `${e}: Please verify the "Sheet URL" is pasted correctly`;
  }
  // Get the values from the sheet's data range as a matrix of values.
  const rows = sheet.getDataRange().getValues();
  // Normalize the headers (first row) to valid BigQuery column names.
  // https://cloud.google.com/bigquery/docs/schemas#column_names
  rows[0] = rows[0].map((header) => {
    header = header.toLowerCase().replace(/[^\w]+/g, '_');
    // Column names must not start with a digit.
    if (header.match(/^\d/))
      header = '_' + header;
    return header;
  });
  // Create the BigQuery load job config. For more information, see:
  // https://developers.google.com/apps-script/advanced/bigquery
  const loadJob = {
    configuration: {
      load: {
        destinationTable: {
          projectId: projectId,
          datasetId: datasetId,
          tableId: tableId
        },
        autodetect: true, // Infer schema from contents.
        // Tolerate up to this many rows that fail to parse against the
        // autodetected schema instead of failing the whole job.
        // (To ignore extra unknown fields instead, see ignoreUnknownValues.)
        maxBadRecords: maxBadRecords,
        writeDisposition: append ? 'WRITE_APPEND' : 'WRITE_TRUNCATE',
      }
    }
  };
  // BigQuery load jobs can only load files, so we need to transform our
  // rows (matrix of values) into a blob (file contents as string).
  // For convenience, we convert the rows into a CSV data string.
  // https://cloud.google.com/bigquery/docs/loading-data-local
  const csvRows = rows.map(values =>
    // We use JSON.stringify() to add "quotes to strings",
    // but leave numbers and booleans without quotes.
    // If a string itself contains quotes ("), JSON escapes them with
    // a backslash as \" but the CSV format expects them to be
    // escaped as "", so we replace all the \" with "".
    values.map(value => JSON.stringify(value).replace(/\\"/g, '""'))
  );
  const csvData = csvRows.map(values => values.join(',')).join('\n');
  const blob = Utilities.newBlob(csvData, 'application/octet-stream');
  // Run the BigQuery load job.
  try {
    BigQuery.Jobs.insert(loadJob, projectId, blob);
  } catch (e) {
    return e;
  }
  Logger.log(
    'Load job started. Click here to check your jobs: ' +
    `https://console.cloud.google.com/bigquery?project=${projectId}&page=jobs`
  );
  // The status of a successful run contains the timestamp.
  return `last run: ${Utilities.formatDate(new Date(), SpreadsheetApp.getActive().getSpreadsheetTimeZone(), "yyyy-MM-dd HH:mm") }`;
}
/**
 * Creates a dataset if it doesn't exist, otherwise does nothing.
 *
 * @param {string} projectId - Google Cloud Project ID.
 * @param {string} datasetId - BigQuery Dataset ID.
 */
function createDatasetIfDoesntExist(projectId, datasetId) {
  try {
    // Probe for the dataset; get() throws when it is missing.
    BigQuery.Datasets.get(projectId, datasetId);
  } catch (missingDatasetError) {
    // Not found — create it now so the upcoming load job has a destination.
    const newDataset = {
      datasetReference: { projectId, datasetId },
    };
    BigQuery.Datasets.insert(newDataset, projectId);
    Logger.log(`Created dataset: ${projectId}:${datasetId}`);
  }
}
/**
 * Opens the spreadsheet sheet (tab) with the given URL.
 *
 * @param {string} sheetUrl - Google Sheet Url.
 *
 * @returns {Sheet} - The sheet corresponding to the URL. If the URL carries
 *     no `gid=` fragment, the first sheet (tab) is returned, matching what
 *     the browser shows when such a URL is opened.
 *
 * @throws {Error} Throws an error if the sheet (tab) doesn't exist.
 */
function openSheetByUrl(sheetUrl) {
  // Extract the sheet (tab) ID from the Url.
  const sheetIdMatch = sheetUrl.match(/gid=(\d+)/);
  const spreadsheet = SpreadsheetApp.openByUrl(sheetUrl);
  // No gid in the URL means the default (first) tab.
  if (!sheetIdMatch) {
    return spreadsheet.getSheets()[0];
  }
  const sheetId = sheetIdMatch[1];
  // From the open spreadsheet, get the sheet (tab) that matches the sheetId.
  const sheet = spreadsheet.getSheets().find(s => s.getSheetId() == sheetId);
  if (!sheet)
    // Throw a real Error (not a bare string) so callers get a proper stack.
    throw new Error('Sheet tab ID does not exist');
  return sheet;
}
解决方案
如果要设置允许的最大错误数,可以在 load 配置中使用 maxBadRecords 参数。如果您想忽略数据中多余的未知字段,可以改为将 ignoreUnknownValues 设置为 true(两者按需选用其一)。
// Same load-job configuration as in sheetToBigQuery, with the
// error-tolerance options added to the `load` section.
let loadJob = {
configuration: {
load: {
destinationTable: {
projectId: projectId,
datasetId: datasetId,
tableId: tableId
},
autodetect: true, // Infer schema from contents.
// Allow up to this many unparseable rows before the job fails.
// maxBadRecords: 1000,
// Ignore row values that don't match the schema's columns.
ignoreUnknownValues: true, // use one or the other
writeDisposition: append ? 'WRITE_APPEND' : 'WRITE_TRUNCATE',
}
}
};
参考:
推荐阅读
- visual-studio - 在创建具有重叠控件的对话框时使用 Visual Studio 资源编辑器的任何提示?
- r - 在 R 中对大型数据集的行进行分组
- javascript - 用 D3 更新力有向图
- css - 带网格的响应列:3 个等宽列
- r - 如何在 R 中使用 lapply 多次运行具有来自不同数据帧的变量的模型
- objective-c - PDFView 获取/设置滚动位置
- reactjs - antd 表单使用复杂的多级嵌套对象进行初始化
- javascript - 如何在Javascript中将对象对象转换为对象数组
- java - 如何在 firebase 中实时检索此元素?
- rust - Rust - 没有为 `OsString` 实现特征`StdError`