![](/img/trans.png)
[英]How to properly test pcollection length when unit testing Apache Beam
[英]Unit testing apache beam job
我有一个非常简单的 Dataflow 作业,想为它编写单元测试。遗憾的是,我找不到能说明最佳做法的好示例。
这是代码
import logging
from datetime import datetime
from re import sub
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from beam_nuggets.io import relational_db
class BeamOptions(PipelineOptions):
    """Command-line options of the job.

    Covers the BigQuery input (table, project, dataset), the GCS bucket and
    the connection settings of the profiles DB (password, host, port).
    """

    @classmethod
    def _add_argparse_args(cls, parser):
        """Register the job-specific flags on ``parser``.

        Args:
            parser: Parser object exposing ``add_argument``; every flag gets
                a default so the options can be built without arguments.
        """
        parser.add_argument(
            "--bigquery_input_table",
            help="A table from BigQuery to process",
            default="bigquery_input_table_empty",
        )
        parser.add_argument(
            "--bigquery_project", help="Project with BigQuery data", default="foo"
        )
        parser.add_argument(
            "--bigquery_dataset", help="Dataset from BigQuery", default="bar"
        )
        parser.add_argument(
            "--bucket", help="GCS Bucket for temporary files", default="model-foo-dev"
        )
        parser.add_argument(
            "--db_password", help="Password for profiles DB", default="postgres"
        )
        parser.add_argument(
            "--db_host", help="host for profiles DB", default="localhost"
        )
        # Parse the port as an integer so a value given on the command line
        # has the same type as the integer default instead of being a string.
        parser.add_argument(
            "--db_port", help="port for profiles DB", type=int, default=5432
        )
class BeamFeed:
    """Job that reads rows from a BigQuery table and writes them to the ``users`` DB table."""

    def __init__(self):
        """Load ``BeamOptions`` and derive the cloud, DB and table settings from them."""
        logging.info("fetching BeamOptions")
        self.pipe_opt = BeamOptions()
        opts = self.pipe_opt.get_all_options()

        # Location of the BigQuery input table.
        self.bigquery_input_table = opts["bigquery_input_table"]
        self.bigquery_project = opts["bigquery_project"]
        self.bigquery_dataset = opts["bigquery_dataset"]

        self.google_cloud_options = self.pipe_opt.view_as(GoogleCloudOptions)
        # Every non-digit character of the ISO timestamp is replaced by "-".
        timestamp = sub(r"[^0-9]", "-", datetime.now().isoformat())
        self.google_cloud_options.job_name = "model-foo-data-preparation-" + timestamp

        # Staging and temp locations live under the same bucket.
        bucket_url = "gs://" + opts["bucket"]
        self.google_cloud_options.staging_location = bucket_url + "/staging"
        self.google_cloud_options.temp_location = bucket_url + "/temp"

        # Connection settings for the target database; user and database
        # names are fixed, host/port/password come from the options.
        self.pg_config = relational_db.SourceConfiguration(
            drivername="postgresql+pg8000",
            username="postgres",
            password=opts["db_password"],
            host=opts["db_host"],
            port=opts["db_port"],
            database="postgres",
        )
        self.table_config = relational_db.TableConfiguration(
            name="users", primary_key_columns=["id"]
        )

    def run(self, pipeline):
        """Attach the job's steps to ``pipeline``.

        The steps are: read from the configured BigQuery table, wrap every
        row as ``{"id": <row id>, "value": <row>}``, and write the result
        to the configured DB table. This method does not call
        ``pipeline.run()`` itself.

        Args:
            pipeline: The pipeline object the steps are applied to.
        """
        bigquery_source = beam.io.BigQuerySource(
            table=self.bigquery_input_table,
            dataset=self.bigquery_dataset,
            project=self.bigquery_project,
            validate=True,
        )
        rows = pipeline | "read_users" >> beam.io.Read(bigquery_source)
        keyed_rows = rows | "map_by_user_id" >> beam.Map(
            lambda row: {"id": row["id"], "value": row}
        )
        db_write = relational_db.Write(
            source_config=self.pg_config, table_config=self.table_config
        )
        keyed_rows | "Writing to DB table" >> db_write
if __name__ == "__main__":
    # Let INFO-level messages through on the root logger.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    logging.info("starting BeamFeed process")

    # Build the job first so its parsed options can configure the pipeline.
    feed = BeamFeed()
    with beam.Pipeline(options=feed.pipe_opt) as p:
        feed.run(p)
以下是我尝试编写的单元测试:
import pytest
import apache_beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to
from apache_beam.transforms import Create, FlatMap
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.direct.direct_runner import DirectRunner
from bla import beam, BeamFeed, BeamOptions
@pytest.fixture
def bg_mock(mocker):
    """Patch ``apache_beam.io.BigQuerySource`` and return the patch result.

    Args:
        mocker: Object providing ``patch`` (requested as a pytest fixture).
    """
    # NOTE: an earlier, disabled idea was to feed fixed rows instead, e.g.
    # apache_beam.Create([{"id": "8d73fda0-00b8-4084-bb8a-eea0f5bca46a", "foo": 1}]).
    patched_source = mocker.patch("apache_beam.io.BigQuerySource")
    return patched_source
def test_empty_profile_generation(bg_mock):
    """Build the BeamFeed job on a test pipeline and run it.

    ``bg_mock`` is requested only so that the BigQuery source is patched;
    its value is not used. The test makes no assertions on the output.
    """
    test_pipeline = TestPipeline(runner=DirectRunner())
    feed = BeamFeed()
    feed.run(test_pipeline)
    test_pipeline.run()
我不确定测试 Dataflow 作业的最佳方法是什么。是否只能做集成测试而不使用模拟(mock)?使用上面的模拟时,pytest 运行失败,进程异常终止,输出为:`[1] 16845 abort`
我同意 @de1 的观点:Apache Beam 自身有一些非常好的测试,可以作为你编写自己测试的参考。
根据您要查找的内容,我认为 Apache Beam 的 SDK 代码中有几个文件可以帮助您设置单元测试。
此外,如果您想查看更多 BigQuery 相关的测试,请参阅原文中给出的链接。
他们如何进行测试的快速概述:
我希望这个示例可以帮助您创建自己的作业和单元测试。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.