
How to test a transformation in Palantir Foundry?

We are trying to create a test function for a whole transformation.

import os
from transforms.verbs.testing.TransformRunner import TransformRunner
from transforms.api import Pipeline
from .myproject.datasets import my_transform

# This assumes your test data exists in the folder /test/fixtures/data/ within the repo next to this test
TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'fixtures', 'data')

def test_my_transform(spark_session):
    pipeline = Pipeline()
    pipeline.add_transforms(my_transform)

    runner = TransformRunner(pipeline, '/my_fabulous_project', TEST_DATA_DIR)

    output = runner.build_dataset(spark_session, '/my_fabulous_project/output/test')
    assert output.first()['col_c'] == 3

Based on the documentation and this post, we tried to modify the import of the function, but we always get one of these errors:

transforms._errors.TransformTypeError: Expected arguments to be of type <class 'transforms.api._transform.Transform'>

ModuleNotFoundError: No module named 'test.myproject'

ValueError: attempted relative import beyond top-level package

How to create a working end-to-end testing function for a transformation?

This works for functions decorated both with @transform and @transform_df:

my_transform.py is located in the repository in the src/myproject/datasets folder.

from transforms.api import Input, Output, transform_df
from pyspark.sql import functions as F


@transform_df(
    Output('/some_foundry_path/my_dir/out'),
    input_a=Input('/some_foundry_path/my_dir/in'))
def compute_sum(input_a):
    df = input_a.withColumn('col_c', F.col('col_a') + F.col('col_b'))
    return df

Input file:
[screenshot: preview of the input dataset]

test_my_transform.py is located in the repository in the src/test folder.

import os
from transforms.api import Pipeline
from transforms.verbs.testing.TransformRunner import TransformRunner
from myproject.datasets.my_transform import compute_sum

# Taking this .py file's dir and adding the path to the test data
TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'fixtures/data/input')


def test_compute_sum(spark_session):
    pipeline = Pipeline()
    pipeline.add_transforms(compute_sum)

    runner = TransformRunner(
        pipeline,
        '/some_foundry_path/my_dir/',
        TEST_DATA_DIR
    )
    output = runner.build_dataset(
        spark_session,
        '/some_foundry_path/my_dir/out'
    )
    assert output.head()['col_c'] == 1

The test CSV file (in.csv - it has the same name, in, as the transformation Input) is created inside the repository:
[screenshot: the test file in the repository]

col_a,col_b
0,1
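
Putting the pieces together, the repository layout described above looks roughly like this (a sketch; top-level folder names may vary between repositories):

src/
├── myproject/
│   └── datasets/
│       └── my_transform.py
└── test/
    ├── test_my_transform.py
    └── fixtures/
        └── data/
            └── input/
                └── in.csv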

Note: for all the inputs, the Input path minus TransformRunner's raw_path_prefix argument must equal the test file's full path minus TEST_DATA_DIR. In this example:

Input path (/some_foundry_path/my_dir/in)
minus raw_path_prefix (/some_foundry_path/my_dir/)
equals in

test file full path (...src/test/fixtures/data/input/in)
minus TEST_DATA_DIR (...src/test/fixtures/data/input)
also equals in
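
A quick way to sanity-check this rule is to compare the two relative parts directly. A minimal sketch using os.path.relpath with the paths from this example:

import os

# Foundry Input path, relative to TransformRunner's raw_path_prefix
foundry_rel = os.path.relpath('/some_foundry_path/my_dir/in',
                              '/some_foundry_path/my_dir/')

# Test fixture path, relative to TEST_DATA_DIR
fixture_rel = os.path.relpath('src/test/fixtures/data/input/in',
                              'src/test/fixtures/data/input')

# Both must reduce to the same relative path for TransformRunner
# to match the fixture file to the Input
assert foundry_rel == fixture_rel == 'in'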


To make tests run automatically together with checks, uncomment the relevant line in transforms-python/build.gradle (in current Foundry Python repositories, this is typically the commented-out line applying the com.palantir.transforms.lang.pytest-defaults plugin).

I have tried the in-memory approach on an incremental transform and I'm receiving the error below. Any idea what I might be doing wrong?

from transforms.api import Pipeline
from transforms.verbs.testing.TransformRunner import TransformRunner
from transforms.verbs.testing.datastores import InMemoryDatastore
# `compute` is the @incremental transform under test (its import was omitted in the original post)


def test_transformdata_incr(spark_session):

    df_input = spark_session.createDataFrame([
        (1, 'abc', 1000, '2022-08-01'),
        (2, 'abc', 2000, '2022-08-02'),
        (3, 'def', 2000, '2022-08-01')
    ], ['a', 'b', 'c', 'd'])

    df_expected = spark_session.createDataFrame([
        ('abc', 3000, '2022-08-12 15:10:00'),
        ('def', 2000, '2022-08-12 15:10:00')
    ], ['b', 's_colc', 'ts'])

    transform_input = '/data/sampledata'
    transform_output = '/data/sampledata_output'

    pipeline = Pipeline()
    pipeline.add_transforms(compute)
    store = InMemoryDatastore()
    store.store_dataframe(transform_input, df_input)
    runner = TransformRunner(pipeline, datastore=store)
    df_output = runner.build_dataset(spark_session, transform_output)

    assert df_output.subtract(df_expected).count() == 0
    assert df_expected.subtract(df_output).count() == 0
    assert df_output.schema == df_expected.schema

Error:

> df_output = runner.build_dataset(spark_session, transform_output)
myproject/tests/test_transformdata_incr.py:30:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../build/conda/env/lib/python3.8/site-packages/transforms/verbs/testing/TransformRunner.py:74: in build_dataset
    self._build_dataset_recursive(spark_session, alias)
../build/conda/env/lib/python3.8/site-packages/transforms/verbs/testing/TransformRunner.py:84: in _build_dataset_recursive
    self._build_derived(spark_session, alias)
../build/conda/env/lib/python3.8/site-packages/transforms/verbs/testing/TransformRunner.py:107: in _build_derived
    transform.compute(**kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <transforms.api._incremental._IncrementalCompute object at 0x7f1b6f9afd60>
ctx = <transforms.verbs.testing.dummies.DummyTransformContext object at 0x7f1b6e85c790>
transform_ios = {'output': <transforms.verbs.testing.dummies.DummyTransformOutput object at 0x7f1b6e85c730>, 'source_df': <transforms.verbs.testing.dummies.DummyTransformInput object at 0x7f1b6e85c070>}
tinputs = {'source_df': <transforms.verbs.testing.dummies.DummyTransformInput object at 0x7f1b6e85c070>}
toutputs = {}
parameters = {'output': <transforms.verbs.testing.dummies.DummyTransformOutput object at 0x7f1b6e85c730>}

    def __call__(self, ctx=None, **transform_ios):  # pylint: disable=arguments-differ
        """Run the computation by dynamically constructing IncrementalX objects
        from the general X objects.

        TransformInput -> IncrementalTransformInput
        TransformOutput -> IncrementalTransformOutput
        TransformContext -> IncrementalTransformContext
        """
        tinputs = {
            name: tio for name, tio in transform_ios.items()
            if isinstance(tio, _transform.TransformInput) and not isinstance(tio, _transform.TransformOutput)
        }
        toutputs = {
            name: tio for name, tio in transform_ios.items()
            if isinstance(tio, _transform.TransformOutput)
        }
        parameters = {
            name: param for name, param in transform_ios.items()
            if not isinstance(param, _transform.TransformInput) and not isinstance(param, _transform.TransformOutput)
        }
>       foundry = list(toutputs.values())[0]._dfreader._foundry  # Kinda horrible, but we grab a foundry instance
E       IndexError: list index out of range
../build/conda/env/lib/python3.8/site-packages/transforms/api/_incremental.py:169: IndexError

After trying out several approaches with different conditions, the following approach seems cleanest to me.

  • no hard-coded paths to datasets
  • it is very explicit about adding/removing transformation inputs
  • in-memory dataframes are used as test inputs

test_my_transform.py

from transforms.api import Pipeline
from transforms.verbs.testing.TransformRunner import TransformRunner
from transforms.verbs.testing.datastores import InMemoryDatastore
from myproject.datasets.my_transform import compute_sum


def test_compute_sum(spark_session):

    df_input1 = spark_session.createDataFrame([
        (0, 2)
    ], ['col_a', 'col_b'])

    df_input2 = spark_session.createDataFrame([
        (0, 1)
    ], ['col_a', 'col_b'])

    df_expected = spark_session.createDataFrame([
        (0, 1, 1),
        (0, 2, 2)
    ], ['col_a', 'col_b', 'col_c'])

    # If @transform_df or @transform_pandas, the key is 'bound_output'
    # If @transform, the key is the name of the Output variable
    output_map = {'out': df_expected}
    input_map = {
        'input_a': df_input1,
        'input_b': df_input2,
    }

    pipeline = Pipeline()
    pipeline.add_transforms(compute_sum)
    store = InMemoryDatastore()
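    # Register each in-memory input DataFrame under its transform alias (the Foundry path)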
    for inp_name, inp_obj in pipeline.transforms[0].inputs.items():
        store.store_dataframe(inp_obj.alias, input_map[inp_name])
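    # Look up the output's Foundry path (alias) from the transform definition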
    path_out = pipeline.transforms[0].outputs[list(output_map)[0]].alias
    runner = TransformRunner(pipeline, datastore=store)
    df_out = runner.build_dataset(spark_session, path_out)

    assert df_out.subtract(df_expected).count() == 0
    assert df_expected.subtract(df_out).count() == 0
    assert df_out.schema == df_expected.schema

my_transform.py

from transforms.api import Input, Output, transform
from pyspark.sql import functions as F


@transform(
    out=Output('/some_foundry_path/my_dir/out3'),
    input_a=Input('/some_foundry_path/my_dir/in'),
    input_b=Input('/some_foundry_path/my_dir/in2'))
def compute_sum(input_a, input_b, out):
    input_a = input_a.dataframe()
    input_b = input_b.dataframe()
    df = input_a.unionByName(input_b)
    df = df.withColumn('col_c', F.col('col_a') + F.col('col_b'))
    out.write_dataframe(df)
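
For reference, if compute_sum were instead written with @transform_df, the only change in the test would be the output_map key, which becomes 'bound_output' (see the comment in the test above). A minimal sketch of the equivalent transform, using the same hypothetical paths:

from transforms.api import Input, Output, transform_df
from pyspark.sql import functions as F


@transform_df(
    Output('/some_foundry_path/my_dir/out3'),
    input_a=Input('/some_foundry_path/my_dir/in'),
    input_b=Input('/some_foundry_path/my_dir/in2'))
def compute_sum(input_a, input_b):
    # @transform_df passes DataFrames directly and expects one back
    df = input_a.unionByName(input_b)
    return df.withColumn('col_c', F.col('col_a') + F.col('col_b'))

# In the test above, the only change would be:
# output_map = {'bound_output': df_expected}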
