
Airflow: create a looping task to run multiple times

I am new to Airflow and wanted to run a bunch of tasks in a loop, but I am facing a cyclic-dependency error.

from airflow import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.dummy import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.hooks.ssh_hook import SSHHook
from datetime import timedelta
from datetime import datetime
import time  # needed for the time.sleep delay callables below

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2021, 4, 13),
    'email': ['raff@abc.com', 'raffg@abc.com'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('sparktestingforstandalone',
          schedule_interval='@yearly',
          default_args=default_args,
          catchup=False
          )

sshHook = SSHHook('conn_ssh_sparkstandalone')
linux_command_1 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task1.py '
linux_command_2 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task2.py '
linux_command_3 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task3.py '
linux_command_4 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task4.py '
linux_command_5 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task5.py '
linux_command_6 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task6.py '
linux_command_7 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task7.py '

start_op = DummyOperator(task_id='start_spark_runs',dag=dag)

t1 = SSHOperator(
    ssh_hook=sshHook,
    task_id='File_Extract_useCase',
    command=linux_command_1,
    dag=dag)

t1_1 = SSHOperator(
    ssh_hook=sshHook,
    task_id='File_Extract_useCase_1',
    command=linux_command_1,
    dag=dag)

t2 = SSHOperator(
    ssh_hook=sshHook,
    task_id='File_Extract_useCase_3',
    command=linux_command_2,
    dag=dag)

t2_1 = SSHOperator(
    ssh_hook=sshHook,
    task_id='File_Extract_useCase_12',
    command=linux_command_2,
    dag=dag)

t3 = SSHOperator(
    ssh_hook=sshHook,
    task_id='Join_useCase',
    command=linux_command_3,
    dag=dag)

t3_1 = SSHOperator(
    ssh_hook=sshHook,
    task_id='Join_useCase_1',
    command=linux_command_3,
    dag=dag)

t4 = SSHOperator(
    ssh_hook=sshHook,
    task_id='Denoramlize_usecase',
    command=linux_command_5,
    dag=dag)

t5 = SSHOperator(
    ssh_hook=sshHook,
    task_id='1798_useCase',
    command=linux_command_5,
    dag=dag)

t6 = SSHOperator(
    ssh_hook=sshHook,
    task_id='Json_Complex_Creation',
    command=linux_command_6,
    dag=dag)

t7 = SSHOperator(
    ssh_hook=sshHook,
    task_id='DB_to_DB_Creation',
    command=linux_command_7,
    dag=dag)

s1 = PythonOperator(task_id="delay_sleep_task_30sec",
                                dag=dag,
                                python_callable=lambda: time.sleep(30))

s1 = PythonOperator(task_id="delay_sleep_task_30sec_1",
                                dag=dag,
                                python_callable=lambda: time.sleep(30))
s2 = PythonOperator(task_id="delay_sleep_task_30sec_2",
                                dag=dag,
                                python_callable=lambda: time.sleep(30))
s3 = PythonOperator(task_id="delay_sleep_task_30sec_3",
                                dag=dag,
                                python_callable=lambda: time.sleep(30))
s4 = PythonOperator(task_id="delay_sleep_task_30sec_4",
                                dag=dag,
                                python_callable=lambda: time.sleep(30))
s5 = PythonOperator(task_id="delay_sleep_task_30sec_5",
                                dag=dag,
                                python_callable=lambda: time.sleep(30))
s6 = PythonOperator(task_id="delay_sleep_task_30sec_6",
                                dag=dag,
                                python_callable=lambda: time.sleep(30))
s7 = PythonOperator(task_id="delay_sleep_task_30sec_7",
                                dag=dag,
                                python_callable=lambda: time.sleep(30))
s8 = PythonOperator(task_id="delay_sleep_task_30sec_8",
                                dag=dag,
                                python_callable=lambda: time.sleep(30))
s9 = PythonOperator(task_id="delay_sleep_task_30sec_9",
                                dag=dag,
                                python_callable=lambda: time.sleep(30))
s10 = PythonOperator(task_id="delay_sleep_task_30sec_10",
                                dag=dag,
                                python_callable=lambda: time.sleep(30))
s11 = PythonOperator(task_id="delay_sleep_task_30sec_11",
                                dag=dag,
                                python_callable=lambda: time.sleep(30))
s12 = PythonOperator(task_id="delay_sleep_task_30sec_12",
                                dag=dag,
                                python_callable=lambda: time.sleep(30))


end_op = DummyOperator(task_id='end_spark_runs', dag=dag)

start_op >> t1 >> t1_1 >> end_op
start_op >> t2 >> t2_1 >> end_op
start_op >> t5 >> end_op
start_op >> t7 >> s1 >> t7 >> s2 >> t7 >> s3 >> end_op
start_op >> [t3,t4,t6] >> s4 >> [t3,t4,t6] >> s5 >> [t3,t4,t6] >> s6 >> [t3,t4,t6] >> s7 >> [t3,t4,t6] >> s8 >> [t3,t4,t6] >> s9 >> [t3,t4,t6] >> s10 >> [t3,t4,t6] >> s11 >> [t3,t4,t6] >> s12 >> end_op

I know this is messy; is there an elegant way to implement the same thing?

I want to run the t3, t4, and t6 tasks in parallel in a loop n times, sleeping 30 seconds between runs. Several other tasks, such as t7, also need to be triggered.

I want to trigger some tasks once and others multiple times within the same DAG. I don't want to create as many task instances as I have here; I need to do it in a more elegant manner, as mentioned above.

You can't make loops in an Airflow DAG; by definition, a DAG is a Directed Acyclic Graph.

But you can use TriggerDagRunOperator, which will trigger a DagRun of the DAG you specify.

from airflow.operators.dagrun_operator import TriggerDagRunOperator  # Airflow 1.x import path

def dag_run_payload(context, dag_run_obj):
    # You can add the data of dag_run.conf in here:
    # use your context information and add it to the
    # dag_run_obj.payload
    dag_run_obj.payload = {}
    return dag_run_obj  # returning None would skip the trigger

trigger_next_iter = TriggerDagRunOperator(
    dag=dag,
    task_id='trigger_next_iter',
    trigger_dag_id='sparktestingforstandalone',  # Or any other DAG
    execution_date="{{ ti.xcom_pull(...) }}",  # It is templated
    python_callable=dag_run_payload
)

end_op >> trigger_next_iter

You can attach the trigger at the end of your DAG.

Note: this will not work as-is in Airflow 2. TriggerDagRunOperator changed in later versions and no longer provides python_callable, but you can still pass the dag_run configuration.
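For Airflow 2, a minimal sketch of the same pattern would look like the following; the conf payload here is made up for illustration and would be read back inside the triggered run via dag_run.conf.

# Airflow 2 style: the operator moved and conf replaces the python_callable payload.
from airflow.operators.trigger_dagrun import TriggerDagRunOperator

trigger_next_iter = TriggerDagRunOperator(
    task_id='trigger_next_iter',
    trigger_dag_id='sparktestingforstandalone',  # the DAG to re-trigger
    conf={'triggered_by': 'previous_run'},  # hypothetical payload
    dag=dag,
)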

Other considerations

You may want to look at other Airflow capabilities to accomplish your goal:

  • Using the on_success_callback of your operator will make your DAG less cluttered. You can add your time.sleep(30) there (note that callbacks receive the task context as an argument).
  • Like creating SubDags: you can group your [t3, t4, t6] tasks and add an on_success_callback with the wait time.
import time
from airflow.operators.subdag_operator import SubDagOperator  # Airflow 1.x import path

def subdag(parent_dag_name, child_dag_name, args):
    # Your SubDag definition here: build and return a DAG whose dag_id is
    # '<parent_dag_name>.<child_dag_name>'.
    pass

section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag(DAG_NAME, 'section-1', args),  # DAG_NAME and args come from your DAG definition
    dag=dag,
    on_success_callback=lambda context: time.sleep(30)  # callbacks are called with the task context
)
  • Separate your current DAG logic into multiple DAGs and use Airflow's built-in controls, like creating Pools and restricting the max_active_runs of your DAG (see the sketch after this list).
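A minimal sketch of that last idea, assuming a pre-created pool named spark_pool (the name is hypothetical) and reusing the names from the DAG above:

dag = DAG(
    'sparktestingforstandalone',
    default_args=default_args,
    schedule_interval='@yearly',
    catchup=False,
    max_active_runs=1,  # at most one active DagRun of this DAG at a time
)

spark_task = SSHOperator(
    ssh_hook=sshHook,
    task_id='Join_useCase',
    command=linux_command_3,
    pool='spark_pool',  # pool created beforehand via the UI or CLI; caps parallel task slots
    dag=dag,
)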

I did it using a for-loop, generating the task names and appending them to a list, running a few tasks n, n+3, n+2, and n+10 times one after another. The solution is just an extension of the one found in Airflow rerun a single task multiple times on success.

from airflow import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.dummy import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.hooks.ssh_hook import SSHHook
from datetime import timedelta
from datetime import datetime

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2021, 4, 13),
    'email': ['raff@abc.com', 'raffg@abc.com'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('sparktestingforstandalone',
          schedule_interval='@yearly',
          default_args=default_args,
          catchup=False
          )

sshHook = SSHHook('conn_ssh_sparkstandalone')
linux_command_1 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task1.py '
linux_command_2 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task2.py '
linux_command_3 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task3.py '
linux_command_4 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task4.py '
linux_command_5 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task5.py '
linux_command_6 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task6.py '
linux_command_7 = 'spark-submit --conf "spark.cores.max=20" --conf "spark.executor.cores=2" --executor-memory 1G --driver-memory 2G /hadoopData/bdipoc/poc/python/task7.py '

start_op = DummyOperator(task_id='start_spark_runs',dag=dag)

t5 = SSHOperator(
    ssh_hook=sshHook,
    task_id='nonloop_usecase',
    command=linux_command_5,
    dag=dag)

chain_operators = []
chain_operators.append(start_op)
chain_operators_1 = []
chain_operators_1.append(start_op)
chain_operators_2 = []
chain_operators_2.append(start_op)
chain_operators_3 = []
chain_operators_3.append(start_op)
chain_operators_4 = []
chain_operators_4.append(start_op)
chain_operators_5 = []
chain_operators_5.append(start_op)
max_attempt = 10
for attempt in range(max_attempt):
    data_pull = SSHOperator(
        ssh_hook=sshHook,
        task_id='Usecase_run10_task_2_{}'.format(attempt),
        command=linux_command_3,
        dag=dag
    )
    data_pull_2 = SSHOperator(
        ssh_hook=sshHook,
        task_id='Usecase_run10_task_1_{}'.format(attempt),
        command=linux_command_4,
        dag=dag
    )
    data_pull_3 = SSHOperator(
        ssh_hook=sshHook,
        task_id='Usecase_run10_task_3_{}'.format(attempt),
        command=linux_command_6,
        dag=dag
    )
    chain_operators.append(data_pull)
    chain_operators_1.append(data_pull_2)
    chain_operators_2.append(data_pull_3)
    

max_attempt_1 = 2
for attempt in range(max_attempt_1):
    data_pull_4 = SSHOperator(
        ssh_hook=sshHook,
        task_id='Usecase_runtwice_task_2_{}'.format(attempt),
        command=linux_command_1,
        dag=dag
    )
    data_pull_5 = SSHOperator(
        ssh_hook=sshHook,
        task_id='Usecase_runtwice_task_1_{}'.format(attempt),
        command=linux_command_2,
        dag=dag
    )
    chain_operators_3.append(data_pull_4)
    chain_operators_4.append(data_pull_5)

max_attempt_2 = 3
for attempt in range(max_attempt_2):
    data_pull_6 = SSHOperator(
        ssh_hook=sshHook,
        task_id='Usecase_runthrice_{}'.format(attempt),
        command=linux_command_7,
        dag=dag
    )
    chain_operators_5.append(data_pull_6)


end_op = DummyOperator(task_id='end_spark_runs', dag=dag)
chain_operators_1.append(end_op)
chain_operators_2.append(end_op)
chain_operators_3.append(end_op)
chain_operators_4.append(end_op)
chain_operators_5.append(end_op)
chain_operators.append(end_op)

for i,val in enumerate(chain_operators[:-1]):
    val.set_downstream(chain_operators[i+1])
for j,val in enumerate(chain_operators_1[:-1]):
    val.set_downstream(chain_operators_1[j+1])
for k,val in enumerate(chain_operators_2[:-1]):
    val.set_downstream(chain_operators_2[k+1])

start_op >> t5 >> end_op

for l,val in enumerate(chain_operators_3[:-1]):
    val.set_downstream(chain_operators_3[l+1])
for m,val in enumerate(chain_operators_4[:-1]):
    val.set_downstream(chain_operators_4[m+1])
for n,val in enumerate(chain_operators_5[:-1]):
    val.set_downstream(chain_operators_5[n+1])
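The chain building above can be made more compact by driving all the loops from a single mapping of command to repeat count; a sketch of that variant (the task-id prefixes are made up):

# Sketch: one sequential chain per command, each fanning out from start_op
# and joining at end_op. The repeat counts mirror the hand-written lists above.
chains = {
    'run10_task_a': (linux_command_3, 10),
    'run10_task_b': (linux_command_4, 10),
    'run10_task_c': (linux_command_6, 10),
    'runtwice_a': (linux_command_1, 2),
    'runtwice_b': (linux_command_2, 2),
    'runthrice': (linux_command_7, 3),
}

for prefix, (command, repeats) in chains.items():
    previous = start_op
    for attempt in range(repeats):
        task = SSHOperator(
            ssh_hook=sshHook,
            task_id='{}_{}'.format(prefix, attempt),
            command=command,
            dag=dag,
        )
        previous >> task
        previous = task
    previous >> end_op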
