airflow - Airflow BranchPythonOperator 分支
问题描述
我正在尝试并行运行任务,但我知道 BranchPythonOperator 只返回一个分支。我的问题是,如有必要,我如何返回多个任务?这是我的 DAG:
如果只有一个文件,这种情况下它工作得很好。但如果有两个或更多文件,它只执行其中一个任务,并跳过其他所有任务。我想并行运行相关的任务:例如有 4 个文件时,需要并行运行这 4 个文件对应的任务,并跳过其余任务。
我怎么能做这样的事情?
我的代码:
import datetime as dt
from airflow import DAG
import shutil
import os
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.contrib.sensors.file_sensor import FileSensor
from airflow.operators.dagrun_operator import TriggerDagRunOperator
# Directory containing the Python scripts launched by the BashOperator tasks.
scriptAirflow = '/home/alexw/scriptAirflow/testFile/'
# Directory scanned for incoming files (watched by the file sensor).
uploadPath = '/apps/lv-manuf2020-data/80_DATA/00_Loading/'
# Directory the MEM*/FMS* CSV files are moved into before being processed.
receiptPath = '/apps/lv-manuf2020-data/80_DATA/01_Receipt/'
# Listing of the receipt directory captured when this module is imported.
# Guarded with isdir so that a missing directory gives an empty list instead
# of raising while the DAG file is being parsed.
allReceiptFiles = os.listdir(receiptPath) if os.path.isdir(receiptPath) else []
# Task ids of the script-runner tasks defined in this DAG. The first entry
# previously read 'kpi_opj_data', which matches no task id, and
# 'material_makt' was missing.
branchTask = [
    'kpi_obj_data',
    'material_mvke',
    'material_mara',
    'material_mbew',
    'material_marm',
    'material_mdma',
    'material_marc',
    'material_mard',
    'material_makt',
]
def parseFileName(file):
    """Extract the lower-cased base name embedded in a file name.

    Everything up to and including the second underscore is dropped, then
    the text before the first ``-`` of what remains is returned in lower
    case. For example ``'FMS_X_material_mara-2020.csv'`` gives
    ``'material_mara'``.

    Args:
        file: File name (not a path) to parse.

    Returns:
        The lower-cased base name, or ``''`` when the name contains fewer
        than two underscores.
    """
    # maxsplit=2 leaves underscores inside the base name untouched, so no
    # re-joining is needed afterwards.
    fields = file.split('_', 2)
    remainder = fields[2] if len(fields) > 2 else ''
    return remainder.split('-', 1)[0].lower()
def onlyCsvFiles():
    """Move incoming MEM*/FMS* CSV files to the receipt directory and branch.

    Every file in ``uploadPath`` whose name starts with ``MEM`` or ``FMS``
    and ends with ``.csv`` is moved into ``receiptPath``. The receipt
    directory is then inspected as a whole to choose the branch.

    Returns:
        ``"result_mv"`` when the receipt directory contains at least one
        matching CSV file, otherwise ``"no_file_timeout"``. A task id is
        always returned, so the branch operator never receives ``None``.
    """
    def _is_data_csv(name):
        # str.startswith accepts a tuple of alternative prefixes.
        return name.startswith(('MEM', 'FMS')) and name.endswith('.csv')

    incoming = os.listdir(uploadPath)
    if not incoming:
        print('No file in upload_00')
        return "no_file_timeout"

    for name in incoming:
        if _is_data_csv(name):
            shutil.move(uploadPath + name, receiptPath)
            print(name + ' moved in ' + receiptPath + name)

    # Look at every entry: deciding on the first one alone would send the
    # run to the timeout branch whenever an unrelated file is listed first.
    if any(_is_data_csv(name) for name in os.listdir(receiptPath)):
        return "result_mv"
    return "no_file_timeout"
def result(file_names=None):
    """Choose which script family to run from the files waiting in receipt.

    Args:
        file_names: Optional iterable of file names to inspect. When omitted
            the receipt directory is listed at call time, so the decision
            does not rely on a listing captured when the module was
            imported.

    Returns:
        ``"run_both_scripts"`` when both a ``MEM*.csv`` and an ``FMS*.csv``
        file are present, ``"run_for_mem"`` or ``"run_for_fms"`` when only
        one kind is present, and ``None`` (after printing a message) when
        neither is.
    """
    if file_names is None:
        file_names = os.listdir(receiptPath)
    # Materialize once: the names are scanned twice below.
    names = list(file_names)

    has_mem = any(n.startswith('MEM') and n.endswith('.csv') for n in names)
    has_fms = any(n.startswith('FMS') and n.endswith('.csv') for n in names)

    if has_mem and has_fms:
        return "run_both_scripts"
    if has_mem:
        return "run_for_mem"
    if has_fms:
        return "run_for_fms"

    # Nothing matched: report it whether the directory was empty or only
    # held unrelated files.
    print('No script to launch')
    return None
def returnGoodBranch():
    """Return the task ids of every FMS script that has a file waiting.

    Each ``FMS*.csv`` file in ``receiptPath`` is mapped to a task id with
    ``parseFileName``. All distinct ids are returned as a list so the
    branch operator can follow several branches at once, instead of only
    the first one found.

    NOTE(review): this assumes the base name embedded in an FMS file name
    equals the task id of the matching ``material_*`` task — confirm
    against the real file names.

    Returns:
        A list of task ids in first-seen order, without duplicates; empty
        when no FMS CSV file is present.
    """
    task_ids = []
    for name in os.listdir(receiptPath):
        # Skip anything that is not an FMS CSV so unrelated files cannot
        # produce ids of tasks that are not downstream of this branch.
        if not (name.startswith('FMS') and name.endswith('.csv')):
            continue
        task_id = parseFileName(name)
        if task_id and task_id not in task_ids:
            task_ids.append(task_id)
    return task_ids
# Arguments handed to the DAG as defaults for its tasks.
default_args = dict(
    owner='testParallel',
    start_date=dt.datetime(2020, 2, 17),
    retries=1,
)

# No schedule interval is set; the rerun_* tasks in this file trigger the
# DAG again by its id.
dag = DAG(
    'testParallel',
    default_args=default_args,
    description='airflow_manuf2020_v4',
    schedule_interval=None,
    catchup=False,
)
def _make_script_task(script_name):
    """Build the BashOperator that runs ``<script_name>.py`` from scriptAirflow.

    The task id equals the script name, and the ``{{ execution_date }}``
    template placeholder is passed as the script's only argument.
    """
    return BashOperator(
        task_id=script_name,
        bash_command='python3 ' + scriptAirflow + script_name + '.py "{{ execution_date }}"',
        trigger_rule='one_success',
        dag=dag,
    )


# --- Entry: check the upload directory, then move the CSV files ------------
file_sensor = FileSensor(
    task_id="file_sensor", filepath=uploadPath, fs_conn_id='airflow_db',
    poke_interval=10, dag=dag,
)
move_csv = BranchPythonOperator(
    task_id='move_csv', python_callable=onlyCsvFiles,
    trigger_rule='none_failed', dag=dag,
)

# --- Routing: choose MEM only, FMS only, or both ---------------------------
result_mv = BranchPythonOperator(
    task_id='result_mv', python_callable=result,
    trigger_rule='none_failed', dag=dag,
)
run_Mem_Script = DummyOperator(task_id="run_for_mem", dag=dag)
kpi_obj_data = _make_script_task('kpi_obj_data')
run_Fms_Script = BranchPythonOperator(
    task_id="run_for_fms", python_callable=returnGoodBranch,
    trigger_rule='all_success', dag=dag,
)

# --- Script runners, one task per material_* script ------------------------
material_makt = _make_script_task('material_makt')
material_mara = _make_script_task('material_mara')
material_marc = _make_script_task('material_marc')
material_mard = _make_script_task('material_mard')
material_marm = _make_script_task('material_marm')
material_mbew = _make_script_task('material_mbew')
material_mdma = _make_script_task('material_mdma')
material_mvke = _make_script_task('material_mvke')

run_both_scripts = DummyOperator(task_id="run_both_scripts", dag=dag)

# --- No-file path: sleep 300 seconds, then trigger the DAG again -----------
no_file_timeout = BashOperator(
    task_id="no_file_timeout", bash_command='sleep 300',
    trigger_rule='all_done', dag=dag,
)
rerun_dag_no_file = TriggerDagRunOperator(
    task_id='rerun_dag_no_file', trigger_dag_id='testParallel',
    trigger_rule='all_success', dag=dag,
)

# --- Join point after the script tasks, then trigger the DAG again ---------
checking_file = DummyOperator(task_id='file_ok', trigger_rule='all_done', dag=dag)
rerun_dag = TriggerDagRunOperator(
    task_id='rerun_dag', trigger_dag_id='testParallel',
    trigger_rule='all_done', dag=dag,
)
# Entry chain: sensor -> move_csv, which branches to result_mv or to the
# no-file path.
move_csv.set_upstream(file_sensor)
result_mv.set_upstream(move_csv)
no_file_timeout.set_upstream(move_csv)
rerun_dag_no_file.set_upstream(no_file_timeout)

# result_mv selects one of the three routing tasks.
for _router in (run_both_scripts, run_Fms_Script, run_Mem_Script):
    _router.set_upstream(result_mv)

# The KPI script is reached from the MEM route or the "both" route.
kpi_obj_data.set_upstream([run_Mem_Script, run_both_scripts])

# Every material script is reached from the "both" route or the FMS route.
_material_tasks = [
    material_makt,
    material_mara,
    material_marc,
    material_mard,
    material_marm,
    material_mbew,
    material_mdma,
    material_mvke,
]
for _task in _material_tasks:
    _task.set_upstream([run_both_scripts, run_Fms_Script])

# Join: each script task feeds checking_file exactly once (material_mvke
# used to be registered twice), then the DAG is triggered again.
checking_file.set_upstream(_material_tasks + [kpi_obj_data])
rerun_dag.set_upstream(checking_file)
这些任务都是 BashOperator,用来调用 Python 脚本。我卡在这个问题上了,如果有人有解决方案,非常感谢!
解决方案
BranchPythonOperator 可以返回任务 ID 列表。例如,如果您要执行 material_marm、material_mbew 和 material_mdma,只需在 Python 可调用函数中返回这些任务 ID:return ["material_marm", "material_mbew", "material_mdma"]
如果您想了解更多关于 BranchPythonOperator 的信息,请查看我的帖子,我相信它会对您有所帮助:)
推荐阅读
- go - 发出 http 请求后从响应头中获取 CSRF Token
- terraform - Terraform:使用变量创建多个指标过滤器和警报
- c# - 如何让我的文本框显示这个值?
- python - Discord Python bot不发送消息
- asp.net-core-3.1 - 如何在 Devart dotConnect for Orcale 中禁用缓存 - .netCore 3.1
- ios - SwiftUI:具有计算属性的 ObservableObject
- node.js - Heroku 上的 Babel 转译保留关键字“包”错误
- javascript - 如何替换嵌入式 svg 中的图像?
- django - Django 3.0.8 Field.disable UpdateView
- proxy - 自动代理http流量授权脚本