python - Dataflow 流式作业 - 写入 BigQuery 时出错
问题描述
使用 FILE_LOADS 方式通过 Apache Beam Dataflow 作业写入 BigQuery 时遇到错误。流式插入(Streaming INSERT,else 分支)工作正常,符合预期;FILE_LOADS(if 分支)失败,错误信息附在代码之后。GCS 存储桶上的临时文件是有效的 JSON 对象。
来自 Pub/Sub 的原始事件示例:
"{'event': 'test', 'entityId': 13615316690, 'eventTime': '2020-08-12T15:56:07.130899+00:00', 'targetEntityId': 8947793, 'targetEntityType': 'item', 'entityType': 'guest', 'properties': {}}"
"{'event': 'test', 'entityId': 13615316690, 'eventTime': '2020-08-12T15:56:07.130899+00:00', 'targetEntityId': 8947793, 'targetEntityType': 'item', 'entityType': 'guest', 'properties': {'action': 'delete'}}"
from __future__ import absolute_import
import logging
import sys
import traceback
import argparse
import ast
import json
import datetime
import dateutil.parser as date_parser
import apache_beam as beam
import apache_beam.pvalue as pvalue
from google.cloud.bigquery import CreateDisposition, WriteDisposition
from apache_beam.io.gcp.bigquery_tools import RetryStrategy
def _coerce_event(element):
    """Return the event as a dict, decoding a raw Pub/Sub payload if needed.

    Dicts (or any other non-bytes, non-str object) are returned unchanged.
    """
    if isinstance(element, bytes):
        element = element.decode('utf-8')
    if isinstance(element, str):
        # NOTE(review): the sample events look like Python dict reprs (single
        # quotes), not strict JSON -- confirm the publisher's format. JSON is
        # tried first; ast.literal_eval only evaluates literals, so it is safe
        # on untrusted text.
        try:
            element = json.loads(element)
        except ValueError:
            element = ast.literal_eval(element)
        if isinstance(element, str):
            # The payload was a quoted string wrapping the dict repr.
            element = ast.literal_eval(element)
    return element


def get_values(element):
    """Convert one event into a row dict for the raw BigQuery table.

    Args:
        element: the event, either as a dict or as the raw ``bytes``/``str``
            payload read from Pub/Sub (decoded by ``_coerce_event``).

    Returns:
        A dict with the keys ``event``, ``entity_type``, ``entity_id``,
        ``target_entity_type``, ``target_entity_id``, ``event_time`` and
        ``properties``.

    Raises:
        ValueError, SyntaxError: if a bytes/str payload cannot be parsed.
    """
    element = _coerce_event(element)

    # Convert properties from a dict to a list of dicts so it forms a
    # repeatable BigQuery record. A missing or None 'properties' yields an
    # empty list instead of raising KeyError/AttributeError.
    properties = element.get('properties') or {}
    prop_list = [{'property_name': k, 'property_value': v} for k, v in properties.items()]

    # Seconds are fixed to "00" by the format string; the UTC offset of the
    # parsed value is not part of the output.
    date_parsed = date_parser.parse(element.get('eventTime'))
    event_time = date_parsed.strftime('%Y-%m-%d %H:%M:00')

    raw_value = {
        'event': element.get('event'),
        'entity_type': element.get('entityType'),
        'entity_id': element.get('entityId'),
        'target_entity_type': element.get('targetEntityType'),
        'target_entity_id': element.get('targetEntityId'),
        'event_time': event_time,
        'properties': prop_list,
    }
    return raw_value
def stream_to_bq(c: dict):
    """Build and launch the streaming Dataflow job: Pub/Sub -> BigQuery.

    Args:
        c: configuration mapping. Keys read here: PROJECT, JOBNAME,
            BUCKET_NAME, STAGING_LOCATION, TEMP_LOCATION, NETWORKPATH,
            SUBNETWORKPATH, REGION, SERVICE_ACCOUNT, FILE_LOAD, SUBSCRIPTION,
            RAW_TABLE, DATASET_NAME and, only when FILE_LOAD is truthy,
            FILE_LOAD_FREQUENCY.
    """
    pipeline_args = [
        f'--project={c["PROJECT"]}',
        '--runner=DataflowRunner',
        f'--job_name={c["JOBNAME"]}',
        '--save_main_session',
        f'--staging_location=gs://{c["BUCKET_NAME"]}/{c["STAGING_LOCATION"]}',
        f'--temp_location=gs://{c["BUCKET_NAME"]}/{c["TEMP_LOCATION"]}',
        f'--network={c["NETWORKPATH"]}',
        f'--subnetwork={c["SUBNETWORKPATH"]}',
        f'--region={c["REGION"]}',
        f'--service_account_email={c["SERVICE_ACCOUNT"]}',
        # Currently disabled options: --setup_file=./setup.py,
        # --autoscaling_algorithm=THROUGHPUT_BASED, --maxWorkers=15,
        # --experiments=shuffle_mode=service
        '--no_use_public_ips',
        '--streaming',
    ]

    use_file_loads = c['FILE_LOAD']
    if use_file_loads:
        # Extra experiments are only passed when writing through load jobs.
        pipeline_args += [
            '--experiments=allow_non_updatable_job',
            '--experiments=use_beam_bq_sink',
        ]

    pipeline = beam.Pipeline(argv=pipeline_args)

    pubsub_source = beam.io.ReadFromPubSub(subscription=c['SUBSCRIPTION']).with_output_types(bytes)
    raw_messages = pipeline | 'Read from Pubsub' >> pubsub_source
    bq_rows = raw_messages | 'Event Parser(BQ Row) ' >> beam.Map(get_values)

    # Load data to BigQuery using 'Load Jobs' or 'Streaming Insert'; the
    # choice is based on latency expectation.
    sink_options = {
        'project': c["PROJECT"],
        'dataset': c["DATASET_NAME"],
        'create_disposition': CreateDisposition.CREATE_NEVER,
        'write_disposition': WriteDisposition.WRITE_APPEND,
    }
    if use_file_loads:
        sink_options['method'] = 'FILE_LOADS'
        sink_options['triggering_frequency'] = c['FILE_LOAD_FREQUENCY']
    else:
        sink_options['insert_retry_strategy'] = RetryStrategy.RETRY_ON_TRANSIENT_ERROR

    bq_rows | 'Write Result to BQ' >> beam.io.WriteToBigQuery(c["RAW_TABLE"], **sink_options)

    pipeline.run()
来自数据流作业的错误:
message: 'Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the errors[] collection for more details.' reason: 'invalid'> [while running 'generatedPtransform-1801'] java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357) java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1895) org.apache.beam.sdk.util.MoreFutures.get(MoreFutures.java:57)
解决方案
这个问题看起来是 BigQuery 加载作业(load job)本身出错。我的建议是先在 Dataflow 之外单独测试加载作业,以确保您的架构和数据结构正常。您可以参考此 BQ 文档。
另外,我注意到您既没有指定 schema,也没有指定 SCHEMA_AUTODETECT。我建议您指定其中之一。
要了解错误,请尝试检查 Dataflow Jobs 日志,其中可能包含大量信息。如果您的加载作业失败,您可以在 BigQuery 中检查这些作业,它们还会为您提供有关失败原因的更多信息。您可以使用此 StackDriver 日志来查找 BQ 加载作业 ID:
resource.type="dataflow_step"
resource.labels.job_id= < YOUR DF JOB ID >
jsonPayload.message:("Triggering job" OR "beam_load")
我认为问题很可能出在重复字段 properties 或表的架构上;考虑到它只在加载作业时失败,架构出问题的可能性更大(也许该表的架构有误)。无论如何,下面是一个可以正常工作的管道,我在我这边测试过,两种 BQ 写入方式都能成功:
# Table schema passed to WriteToBigQuery: a STRING column "name" plus a
# REPEATED RECORD column "repeated" whose entries hold an INTEGER "spent"
# and a TIMESTAMP "ts".
schema = {
    "fields": [
        {"name": "name", "type": "STRING"},
        {
            "name": "repeated",
            "type": "RECORD",
            "mode": "REPEATED",
            "fields": [
                {"name": "spent", "type": "INTEGER"},
                {"name": "ts", "type": "TIMESTAMP"},
            ],
        },
    ],
}
def fake_parsing(element):
    """Return a randomly shaped fake row, ignoring the incoming element.

    A fake parse is used so the pipeline is easy to reproduce. Depending on a
    random draw ``rnd`` in [0, 1) the "repeated" list holds:

    * ``rnd < 0.25``         -- one entry with "spent" and "ts"
    * ``0.25 <= rnd <= 0.5`` -- no entries
    * ``0.5 < rnd <= 0.75``  -- one entry with only "ts"
    * ``rnd > 0.75``         -- the same "spent"/"ts" entry twice

    Args:
        element: the incoming message; not used.

    Returns:
        A dict ``{"name": "inigo", "repeated": [...]}``.
    """
    # Imported locally so the function does not depend on which datetime
    # import style the surrounding module uses, nor on a module-level
    # `import random`.
    import random
    from datetime import datetime

    rnd = random.random()
    ts = datetime.now().strftime('%Y-%m-%d %H:%M:00')

    if rnd < 0.25:
        properties = [{"spent": random.randint(0, 100), "ts": ts}]
    elif rnd > 0.75:
        # repeated
        dict_prop = {"spent": random.randint(0, 100), "ts": ts}
        properties = [dict_prop, dict_prop]
    elif rnd > 0.5:
        # The original condition `0.5 > rnd > 0.75` could never be true, so
        # this "ts only" shape was never generated.
        properties = [{"ts": ts}]
    else:
        properties = []

    return {"name": 'inigo',
            "repeated": properties}
# Read from the Pub/Sub topic and turn every message into a fake row.
pubsub = (
    p
    | "Read Topic" >> ReadFromPubSub(topic=known_args.topic)
    | "To Dict" >> beam.Map(fake_parsing)
)

# Branch 1: write the rows with streaming inserts.
pubsub | "Stream To BQ" >> WriteToBigQuery(
    method="STREAMING_INSERTS",
    table=f"{known_args.table}_streaming_insert",
    schema=schema,
    write_disposition=BigQueryDisposition.WRITE_APPEND,
)

# Branch 2: write the same rows with load jobs.
# NOTE(review): insert_retry_strategy is documented for streaming inserts;
# presumably it has no effect with FILE_LOADS -- confirm.
pubsub | "Load To BQ" >> WriteToBigQuery(
    method=WriteToBigQuery.Method.FILE_LOADS,
    triggering_frequency=known_args.triggering,
    table=f"{known_args.table}_load_job",
    schema=schema,
    write_disposition=BigQueryDisposition.WRITE_APPEND,
    insert_retry_strategy="RETRY_ON_TRANSIENT_ERROR",
)
我建议您尝试管道的一部分,而不是一次全部尝试,即首先尝试加载作业,如果它们失败,检查它们失败的原因(在 Dataflow 日志、BigQuery 日志或 BigQuery UI 中)。完成后,添加流式插入(或相反)。