
How to pass a variable from one task to another in airflow

The code below works, but my requirement is to pass totalbuckets as an input to the function rather than as a global variable. I am unable to pass it as a variable and do an xcom_pull in the next task. This DAG basically creates buckets based on the number of inputs, and totalbuckets is a constant. Thanks in advance for your help.

from collections import defaultdict  # needed for defaultdict below

from airflow import DAG
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.utils.trigger_rule import TriggerRule  # needed for TriggerRule below

with DAG('test-live', catchup=False, schedule_interval=None, default_args=args) as test_live:

    totalbuckets = 3

    # branches based on number of buckets
    def branch_buckets(**context):

        buckets = defaultdict(list)
        for i in range(len(inputs_to_process)):
            buckets[f'bucket_{(1 + i % totalbuckets)}'].append(inputs_to_process[i])

        for bucket_name, input_sublist in buckets.items():
            context['ti'].xcom_push(key=bucket_name, value=input_sublist)
        return list(buckets.keys())

    # BranchPythonOperator will launch the buckets and distribute inputs among the buckets
    branch_buckets = BranchPythonOperator(
        task_id='branch_buckets',
        python_callable=branch_buckets,
        trigger_rule=TriggerRule.NONE_FAILED,
        provide_context=True,
        dag=test_live
    )

    # update provider tables with merge sql
    def update_inputs(sf_conn_id, bucket_name, **context):
        input_sublist = context['ti'].xcom_pull(task_ids='branch_buckets', key=bucket_name)
        print(f"Processing inputs {input_sublist} in {bucket_name}")

        from custom.hooks.snowflake_hook import SnowflakeHook
        for p in input_sublist:
            merge_sql = f"""
            merge into ......"""

    bucket_tasks = []
    for i in range(totalbuckets):
        task = PythonOperator(
            task_id=f'bucket_{i+1}',
            python_callable=update_inputs,
            provide_context=True,
            op_kwargs={'bucket_name': f'bucket_{i+1}', 'sf_conn_id': SF_CONN_ID},
            dag=test_live
        )
        bucket_tasks.append(task)

If totalbuckets differs from one run to another, it should be a run conf variable, which you can provide for every run created from the UI, the CLI, the Airflow REST API, or even the Python API.

from collections import defaultdict  # needed for defaultdict below

from airflow import DAG
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.models.param import Param
from airflow.utils.trigger_rule import TriggerRule  # needed for TriggerRule below
with DAG(
    'test-live',
    catchup=False,
    schedule_interval=None,
    default_args=args,
    params={"totalbuckets": Param(default=3, type="integer")},
) as test_live:
    # branches based on number of buckets
    def branch_buckets(**context):

        buckets = defaultdict(list)
        for i in range(len(inputs_to_process)):
            buckets[f'bucket_{(1+i % int("{{ params.totalbuckets }}"))}'].append(inputs_to_process[i])

        for bucket_name, input_sublist in buckets.items():
            context['ti'].xcom_push(key = bucket_name, value = input_sublist)
        return list(buckets.keys())

    # BranchPythonOperator will launch the buckets and distributes inputs among the buckets
    branch_buckets = BranchPythonOperator(
        task_id='branch_buckets',
        python_callable=branch_buckets,
        trigger_rule=TriggerRule.NONE_FAILED,
        provide_context=True,
        dag=test_live
    )
    # update provider tables with merge sql
    def update_inputs(sf_conn_id, bucket_name, **context):
        input_sublist = context['ti'].xcom_pull(task_ids='branch_buckets', key=bucket_name)
        print(f"Processing inputs {input_sublist} in {bucket_name}")

        from custom.hooks.snowflake_hook import SnowflakeHook
        for p in input_sublist:
            merge_sql=f"""
                merge into ......"""

    bucket_tasks = []
    for i in range(int("{{ params.totalbuckets }}")):
        task= PythonOperator(
            task_id=f'bucket_{i+1}',
            python_callable=update_inputs,
            provide_context=True,
            op_kwargs={'bucket_name':f'bucket_{i+1}','sf_conn_id': SF_CONN_ID},
            dag=test_live
        )
        bucket_tasks.append(task)

An example of triggering it:

airflow dags trigger --conf '{"totalbuckets": 10}' test-live

Or via the UI.
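The same conf can also be passed when the run is created programmatically. Below is a minimal sketch that triggers the DAG through the Airflow 2.x stable REST API with the requests library; the base URL, the basic-auth credentials, and the assumption that an API auth backend is enabled are illustrative, not part of the original answer.

import requests

# POST /api/v1/dags/{dag_id}/dagRuns creates a new run; "conf" becomes dag_run.conf
resp = requests.post(
    "http://localhost:8080/api/v1/dags/test-live/dagRuns",  # assumed webserver URL
    auth=("airflow", "airflow"),                            # hypothetical basic-auth credentials
    json={"conf": {"totalbuckets": 10}},
)
resp.raise_for_status()
print(resp.json()["dag_run_id"])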

Update:

If it is static but differs between environments, it can be an Airflow Variable, read directly in the task with Jinja to avoid fetching it on every DAG file parse.
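For example, Jinja can pull the Variable into a templated field such as op_kwargs, so it is only fetched when the task actually runs. This is a minimal sketch, assuming a Variable named totalbuckets has already been created in the Airflow UI or CLI; the task id and callable are illustrative and not part of the original answer.

from airflow.operators.python import PythonOperator

def print_total(totalbuckets, **context):
    # the Jinja expression arrives here as a rendered string, e.g. "3"
    print(f"totalbuckets = {int(totalbuckets)}")

read_totalbuckets = PythonOperator(
    task_id="read_totalbuckets",
    python_callable=print_total,
    # op_kwargs is a templated field, so {{ var.value.totalbuckets }} is resolved
    # per task run, not while the scheduler parses the DAG file
    op_kwargs={"totalbuckets": "{{ var.value.totalbuckets }}"},
    dag=test_live,
)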

But if it is completely static, the most recommended solution is a plain Python variable, as you already do, because reading the dag run conf or an Airflow Variable makes the task/DAG send queries to the metadata database.

@hussein awala I am doing something like the below, but I am unable to resolve totalbuckets in bucket_tasks.

from collections import defaultdict  # needed for defaultdict below

from airflow import DAG
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.utils.trigger_rule import TriggerRule  # needed for TriggerRule below

with DAG('test-live', catchup=False, schedule_interval=None, default_args=args) as test_live:

    #totalbuckets = 3

    def branch_buckets(totalbuckets, **context):

        buckets = defaultdict(list)
        for i in range(len(inputs_to_process)):
            buckets[f'bucket_{(1 + i % totalbuckets)}'].append(inputs_to_process[i])

        for bucket_name, input_sublist in buckets.items():
            context['ti'].xcom_push(key=bucket_name, value=input_sublist)
        return list(buckets.keys())

    # BranchPythonOperator will launch the buckets and distribute inputs among the buckets
    branch_buckets = BranchPythonOperator(
        task_id='branch_buckets',
        python_callable=branch_buckets,
        trigger_rule=TriggerRule.NONE_FAILED,
        provide_context=True,
        op_kwargs={'totalbuckets': 3},
        dag=test_live
    )

    # update provider tables with merge sql
    def update_inputs(sf_conn_id, bucket_name, **context):
        input_sublist = context['ti'].xcom_pull(task_ids='branch_buckets', key=bucket_name)
        print(f"Processing inputs {input_sublist} in {bucket_name}")

        from custom.hooks.snowflake_hook import SnowflakeHook
        for p in input_sublist:
            merge_sql = f"""
            merge into ......"""

    bucket_tasks = []
    for i in range(totalbuckets):
        task = PythonOperator(
            task_id=f'bucket_{i+1}',
            python_callable=update_inputs,
            provide_context=True,
            op_kwargs={'bucket_name': f'bucket_{i+1}', 'sf_conn_id': SF_CONN_ID},
            dag=test_live
        )
        bucket_tasks.append(task)
