I have a main DAG which retrieves a file and splits the data in this file into separate CSV files. I have another set of tasks that must be done for each of these CSV files. I tried creating subdags dynamically as follows:
import airflow.utils.helpers
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.subdag_operator import SubDagOperator

# default_args, dag_id_parent, db_names, check_sync_enabled and
# spark_submit are defined elsewhere in the file

# prefix for the SubDagOperator task_ids; it is also used in the child dag_id,
# since Airflow requires a subdag's dag_id to be '<parent_dag_id>.<task_id>'
tid_prefix_subdag = 'subdag_'

# create and return a DAG
def create_subdag(dag_parent, dag_id_child_prefix, db_name):
    # dag params
    dag_id_child = '%s.%s' % (dag_parent.dag_id, dag_id_child_prefix + db_name)
    default_args_copy = default_args.copy()

    # dag
    dag = DAG(dag_id=dag_id_child,
              default_args=default_args_copy,
              schedule_interval='@once')

    # operators
    tid_check = 'check2_db_' + db_name
    py_op_check = PythonOperator(task_id=tid_check, dag=dag,
                                 python_callable=check_sync_enabled,
                                 op_args=[db_name])

    tid_spark = 'spark2_submit_' + db_name
    py_op_spark = PythonOperator(task_id=tid_spark, dag=dag,
                                 python_callable=spark_submit,
                                 op_args=[db_name])

    py_op_check >> py_op_spark
    return dag

# wrap DAG into SubDagOperator
def create_subdag_operator(dag_parent, db_name):
    tid_subdag = tid_prefix_subdag + db_name
    subdag = create_subdag(dag_parent, tid_prefix_subdag, db_name)
    sd_op = SubDagOperator(task_id=tid_subdag, dag=dag_parent, subdag=subdag)
    return sd_op

# create a SubDagOperator for each db in db_names
def create_subdag_operators(dag_parent, db_names):
    subdags = [create_subdag_operator(dag_parent, db_name) for db_name in db_names]
    # chain subdag-operators together
    airflow.utils.helpers.chain(*subdags)
    return subdags

# (top-level) DAG & operators
dag = DAG(dag_id=dag_id_parent,
          default_args=default_args,
          schedule_interval=None)

subdag_ops = create_subdag_operators(dag, db_names)
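Here airflow.utils.helpers.chain simply links the operators it is given in sequence:

    # chain(op1, op2, op3) sets the same dependencies as
    op1 >> op2 >> op3

which means the per-database subdags execute one after another rather than in parallel.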
Note that the list of inputs for which subdags are created, here db_names, can either be declared statically in the Python file or be read from an external source.
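For example, here is a minimal sketch of reading db_names from an Airflow Variable; the variable name source_db_names is a hypothetical placeholder, assumed to hold a JSON list such as ["db1", "db2"]:

    from airflow.models import Variable

    # fall back to an empty list if the Variable has not been set
    db_names = Variable.get('source_db_names', default_var=[],
                            deserialize_json=True)

Keep in mind this runs at DAG-parse time, so the scheduler re-evaluates it every time it parses the file.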
The resulting DAG looks like this: one SubDagOperator per database, chained in sequence, each expanding into a check2_db_* >> spark2_submit_* pair.
Diving into SubDAG(s)