[英]Airflow: create a looping task that runs multiple times
I am new to Airflow and want to run a bunch of tasks in a loop; however, I am getting a cyclic-dependency error.我是 Airflow 的新手,想循环运行一堆任务,但是遇到了循环依赖错误。
from airflow import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.dummy import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.hooks.ssh_hook import SSHHook
from datetime import timedelta
from datetime import datetime
# Per-task defaults handed to the DAG constructor below via `default_args=`.
default_args = dict(
    owner='airflow',
    depends_on_past=False,
    start_date=datetime(2021, 4, 13),
    email=['raff@abc.com', 'raffg@abc.com'],
    email_on_failure=True,
    email_on_retry=False,
    retries=0,
    retry_delay=timedelta(minutes=5),
)
# The DAG every task in this script is attached to (via `dag=dag`).
dag = DAG(
    dag_id='sparktestingforstandalone',
    default_args=default_args,
    schedule_interval='@yearly',
    catchup=False,
)

# Hook built from the 'conn_ssh_sparkstandalone' Airflow connection; shared by
# every SSHOperator below.
sshHook = SSHHook(ssh_conn_id='conn_ssh_sparkstandalone')
# All seven jobs use the same spark-submit options; only the number in the
# script name (task1.py .. task7.py) differs. The trailing space is part of
# the original command strings and is kept.
_SPARK_SUBMIT_TEMPLATE = (
    'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" '
    '--executor-memory 1G --driver-memory 2G '
    '/hadoopData/bdipoc/poc/python/task{}.py '
)
(
    linux_command_1,
    linux_command_2,
    linux_command_3,
    linux_command_4,
    linux_command_5,
    linux_command_6,
    linux_command_7,
) = (_SPARK_SUBMIT_TEMPLATE.format(number) for number in range(1, 8))
def _ssh_task(task_id, command):
    """Return an SSHOperator in `dag` that runs `command` through `sshHook`."""
    return SSHOperator(
        ssh_hook=sshHook,
        task_id=task_id,
        command=command,
        dag=dag,
    )


# Entry point that all branches of the DAG fan out from.
start_op = DummyOperator(task_id='start_spark_runs', dag=dag)

# File-extract jobs: two tasks each for task1.py and task2.py.
t1 = _ssh_task('File_Extract_useCase', linux_command_1)
t1_1 = _ssh_task('File_Extract_useCase_1', linux_command_1)
t2 = _ssh_task('File_Extract_useCase_3', linux_command_2)
t2_1 = _ssh_task('File_Extract_useCase_12', linux_command_2)

# Join jobs (task3.py).
t3 = _ssh_task('Join_useCase', linux_command_3)
t3_1 = _ssh_task('Join_useCase_1', linux_command_3)

# NOTE(review): t4 runs linux_command_5 (same as t5) and linux_command_4 is
# never used in this script -- possibly a copy/paste slip; confirm intent.
t4 = _ssh_task('Denoramlize_usecase', linux_command_5)
t5 = _ssh_task('1798_useCase', linux_command_5)
t6 = _ssh_task('Json_Complex_Creation', linux_command_6)
t7 = _ssh_task('DB_to_DB_Creation', linux_command_7)
def _sleep_30_seconds():
    """Block for 30 seconds; the callable behind every delay task below."""
    # Imported locally because the script's import block never imports `time`;
    # the original lambdas would raise NameError when a worker ran them.
    import time

    time.sleep(30)


def _delay_task(task_id):
    """Return a PythonOperator in `dag` that only sleeps for 30 seconds."""
    return PythonOperator(
        task_id=task_id,
        dag=dag,
        python_callable=_sleep_30_seconds,
    )


# The un-suffixed delay task was originally assigned to `s1` and then
# immediately overwritten by the "_1" task, leaving it without a usable name.
# It is still created (same task_id) but now has its own variable.
s0 = _delay_task("delay_sleep_task_30sec")

# s1 .. s12 -> "delay_sleep_task_30sec_1" .. "delay_sleep_task_30sec_12".
s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12 = (
    _delay_task("delay_sleep_task_30sec_{}".format(index))
    for index in range(1, 13)
)
# Exit point that every branch of the DAG converges on.
end_op = DummyOperator(task_id='end_spark_runs', dag=dag)
# Straight-line branches: each task appears once, so these are acyclic.
start_op >> t1 >> t1_1 >> end_op
start_op >> t2 >> t2_1 >> end_op
start_op >> t5 >> end_op
# Cyclic: `t7 >> s1 >> t7` makes the same task object t7 both upstream and
# downstream of s1. Re-using one operator in a chain does not create a second
# run of it; it adds an edge back to the same node.
start_op >> t7 >> s1 >> t7 >> s2 >> t7 >> s3 >> end_op
# Cyclic for the same reason: [t3, t4, t6] are upstream of s4 and then listed
# again downstream of s4 (and likewise around s5 .. s12).
# NOTE(review): t3_1 is created above but never wired into any chain here.
start_op >> [t3,t4,t6] >> s4 >> [t3,t4,t6] >> s5 >> [t3,t4,t6] >> s6 >> [t3,t4,t6] >> s7 >> [t3,t4,t6] >> s8 >> [t3,t4,t6] >> s9 >> [t3,t4,t6] >> s10 >> [t3,t4,t6] >> s11 >> [t3,t4,t6] >> s12 >> end_op
I know this is messy; is there a more elegant way to implement the same thing?我知道这很乱,有没有更优雅的方法来实现同样的效果?
I want to run tasks t3, t4 and t6 in parallel, repeat that n times in a loop, and sleep 30 seconds between runs.我想并行运行 t3、t4、t6 任务,循环执行 n 次,并在每次运行之间休眠 30 秒。 There are also several other tasks, such as t7, that need to be triggered.
还有多个其他任务(如 t7)也需要被触发。
I want to trigger a few tasks a single time and a few tasks multiple times within one DAG. I don't want to create as many task instances as I have done here; it needs to be done in an elegant manner, as mentioned above.我想在一个 DAG 中让一些任务只触发一次、另一些任务触发多次;我不想像这里这样创建那么多实例,需要以上面提到的优雅方式来完成。
You can't make loops in an Airflow DAG: by definition, a DAG is a Directed Acyclic Graph.您不能在 Airflow DAG 中创建循环,根据定义,DAG 是有向无环图。
But you can use TriggerDagRunOperator, which will trigger a DagRun of the DAG you specify.但是您可以使用 TriggerDagRunOperator。
这将触发您指定的 DAG 的 DagRun。
def dag_run_payload(context, dag_run_obj):
    """Fill in the payload for the DagRun that is about to be triggered.

    Args:
        context: the task context passed in by the triggering operator.
        dag_run_obj: placeholder object whose ``payload`` attribute is set here.

    Returns:
        The same ``dag_run_obj``. With the Airflow 1.10 TriggerDagRunOperator
        the run is only created when the callable returns the object;
        returning None skips the trigger.
    """
    # You can add the data of dag_run.conf in here
    # use your context information and add it to the
    # dag_run_obj.payload
    dag_run_obj.payload = {}
    return dag_run_obj
# Airflow 1.10.x API. Requires:
#   from airflow.operators.dagrun_operator import TriggerDagRunOperator
# The callback keyword is `python_callable` (the operator has no
# `python_executable` argument).
trigger_next_iter = TriggerDagRunOperator(
    dag=dag,
    task_id='trigger_next_iter',
    trigger_dag_id='sparktestingforstandalone',  # Or any other DAG
    execution_date="{{ ti.xcom_pull(...) }}",  # Templated; `...` is a placeholder to fill in
    python_callable=dag_run_payload,
)
# Attach the trigger after the last task so the next run starts once this one ends.
end_op >> trigger_next_iter
You can attach the Trigger at the end of your DAG.您可以在 DAG 的末尾附加触发器。
Note: the snippet above will not work on Airflow 2. TriggerDagRunOperator changed in later versions: it no longer accepts a Python callable for building the payload, but you can still specify the dag_run configuration (via its `conf` argument).注意:以上代码不适用于 Airflow 2。TriggerDagRunOperator 在之后的版本中发生了变化,不再接受用于构造 payload 的 Python 可调用对象,但您仍然可以(通过 conf 参数)指定 dag_run 配置。
Maybe you need to check out the other capabilities of Airflow to accomplish your goal.也许您需要查看 Airflow 的其他功能来实现您的目标。
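- Using the on_success_callback of your operator will make your DAG less cluttered. You can put your time.sleep(30) wait there instead of creating separate sleep tasks.使用 operator 的 on_success_callback 将使您的 DAG 不那么混乱。您可以把 time.sleep(30) 的等待放在那里,而不必创建单独的休眠任务。
- Use a SubDAG to group the [t3, t4, t6] tasks, and add an on_success_callback with the wait time to it.使用 SubDAG 将 [t3, t4, t6] 任务分组,并为其添加带有等待时间的 on_success_callback。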
def subdag(parent_dag_name, child_dag_name, args):
    """Build and return the child DAG that the SubDagOperator below runs.

    Args:
        parent_dag_name: dag_id of the DAG that owns the SubDagOperator.
        child_dag_name: task_id of the SubDagOperator.
        args: default_args applied to the child DAG's tasks.

    Returns:
        A DAG whose dag_id is ``<parent_dag_name>.<child_dag_name>``, the
        naming SubDagOperator expects.
    """
    # NOTE(review): the child DAG should normally use the same
    # schedule_interval as its parent -- set it here to match your DAG.
    dag_subdag = DAG(
        dag_id='{}.{}'.format(parent_dag_name, child_dag_name),
        default_args=args,
    )
    # Your SubDag definition here.
    return dag_subdag


def _wait_30_seconds(context):
    """Success callback that sleeps 30 seconds.

    Airflow calls callbacks with the task context as a positional argument,
    so the callable must accept it (a zero-argument lambda raises TypeError).
    """
    import time

    time.sleep(30)


# Requires SubDagOperator to be imported, and DAG_NAME / args to be defined
# (the parent dag_id and its default_args).
section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag(DAG_NAME, 'section-1', args),
    dag=dag,
    on_success_callback=_wait_30_seconds,
)
I did it using for-loops that generate the task names and append the tasks to lists, running different tasks n, n+3, n+2 and n+10 times one after another. The solution is an extension of the one found in "Airflow rerun a single task multiple times on success".我使用 for 循环生成任务名称并把任务追加到列表中,让不同的任务一个接一个地分别运行 n、n+3、n+2 和 n+10 次。该方案是对 "Airflow rerun a single task multiple times on success" 中解决方案的扩展。
from airflow import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.dummy import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.hooks.ssh_hook import SSHHook
from datetime import timedelta
from datetime import datetime
# Per-task defaults handed to the DAG constructor below via `default_args=`.
default_args = dict(
    owner='airflow',
    depends_on_past=False,
    start_date=datetime(2021, 4, 13),
    email=['raff@abc.com', 'raffg@abc.com'],
    email_on_failure=True,
    email_on_retry=False,
    retries=0,
    retry_delay=timedelta(minutes=5),
)
# The DAG every task in this script is attached to (via `dag=dag`).
dag = DAG(
    dag_id='sparktestingforstandalone',
    default_args=default_args,
    schedule_interval='@yearly',
    catchup=False,
)

# Hook built from the 'conn_ssh_sparkstandalone' Airflow connection; shared by
# every SSHOperator below.
sshHook = SSHHook(ssh_conn_id='conn_ssh_sparkstandalone')
# All seven jobs use the same spark-submit options; only the number in the
# script name (task1.py .. task7.py) differs. The trailing space is part of
# the original command strings and is kept.
_SPARK_SUBMIT_TEMPLATE = (
    'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" '
    '--executor-memory 1G --driver-memory 2G '
    '/hadoopData/bdipoc/poc/python/task{}.py '
)
(
    linux_command_1,
    linux_command_2,
    linux_command_3,
    linux_command_4,
    linux_command_5,
    linux_command_6,
    linux_command_7,
) = (_SPARK_SUBMIT_TEMPLATE.format(number) for number in range(1, 8))
def _serial_chain(task_id_template, command, runs):
    """Create `runs` SSH tasks and wire them in series between start and end.

    Args:
        task_id_template: format string with one `{}` slot for the run index.
        command: shell command each generated SSHOperator executes.
        runs: how many sequential copies of the task to create.

    Returns:
        The list [start_op, run_0, ..., run_{runs-1}, end_op], already linked
        so that each element is upstream of the next.
    """
    operators = [start_op]
    for attempt in range(runs):
        operators.append(
            SSHOperator(
                ssh_hook=sshHook,
                task_id=task_id_template.format(attempt),
                command=command,
                dag=dag,
            )
        )
    operators.append(end_op)
    # Link each operator to its successor; distinct task objects keep the
    # graph acyclic even though the same command runs several times.
    for upstream, downstream in zip(operators, operators[1:]):
        upstream.set_downstream(downstream)
    return operators


# Shared entry and exit points for every branch.
start_op = DummyOperator(task_id='start_spark_runs', dag=dag)
end_op = DummyOperator(task_id='end_spark_runs', dag=dag)

# Task that runs exactly once.
t5 = SSHOperator(
    ssh_hook=sshHook,
    task_id='nonloop_usecase',
    command=linux_command_5,
    dag=dag,
)
start_op >> t5 >> end_op
# The original also had `start_op >> t7 >> end_op`, but t7 is never defined in
# this script (NameError at parse time); linux_command_7 is already covered by
# the "runthrice" chain below, so that line is dropped.

# Three parallel branches, each running its task 10 times in series.
max_attempt = 10
chain_operators = _serial_chain('Usecase_run10_task_2_{}', linux_command_3, max_attempt)
chain_operators_1 = _serial_chain('Usecase_run10_task_1_{}', linux_command_4, max_attempt)
# NOTE(review): this template has no underscore before the index (ids
# "..._task_10" .. "..._task_19"); kept as-is so the ids do not collide with
# the branch above.
chain_operators_2 = _serial_chain('Usecase_run10_task_1{}', linux_command_6, max_attempt)

# Two parallel branches, each running its task twice in series.
max_attempt_1 = 2
chain_operators_3 = _serial_chain('Usecase_runtwice_task_2_{}', linux_command_1, max_attempt_1)
chain_operators_4 = _serial_chain('Usecase_runtwice_task_1_{}', linux_command_2, max_attempt_1)

# One branch running its task three times in series.
max_attempt_2 = 3
chain_operators_5 = _serial_chain('Usecase_runthrice_{}', linux_command_7, max_attempt_2)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.