I am trying to create an external table in the Hive metastore using the Apache Hudi framework. It's able to connect to the Hive metastore, but it throws an exception after the connection when trying to create the table.
dataFrame.writeStream
.format("org.apache.hudi")
.option(HoodieWriteConfig.TABLE_NAME, tableName)
.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY,tableName)
.option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
.option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true")
.option(DataSourceWriteOptions.HIVE_AUTO_CREATE_DATABASE_OPT_KEY, "true")
.option(DataSourceWriteOptions.DEFAULT_HIVE_ASSUME_DATE_PARTITION_OPT_VAL, "false")
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "partition_id")
.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "partition_id")
.option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://localhost:10000")
.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, key)
.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, combineKey)
.option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
.option("checkpointLocation", "/tmp/test/checkpoint")
.option("spark.kryo.registrationRequired", "true")
.option("hoodie.upsert.shuffle.parallelism", "1")
.outputMode("append")
.start("s3a://testbucket/test")
dependencies:
scalaVersion := "2.12.1"
libraryDependencies += "org.apache.spark" %% "spark-core" % "3.1.1"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.1.1"
libraryDependencies += "org.apache.spark" %% "spark-sql-kafka-0-10" % "3.1.1"
libraryDependencies += "org.apache.spark" %% "spark-streaming" % "3.1.1" % "provided"
libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka-0-10" % "3.1.1"
libraryDependencies += "org.apache.hudi" %% "hudi-spark-bundle" % "0.7.0"
libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "3.1.4"
libraryDependencies += "org.apache.hadoop" % "hadoop-hdfs" % "3.1.1"
libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "3.1.1"
libraryDependencies += "org.apache.hadoop" % "hadoop-aws" % "3.1.1"
libraryDependencies += "org.apache.spark" %% "spark-hive" % "3.1.1"
libraryDependencies += "org.apache.hive" % "hive-jdbc" % "3.1.1"
libraryDependencies += "org.apache.hive" % "hive-metastore" % "3.1.1"
libraryDependencies += "org.apache.hive" % "hive-exec" % "3.1.1"
dependencyOverrides += "org.apache.hadoop" % "hadoop-common" % "3.1.1"
dependencyOverrides += "org.apache.commons" % "commons-lang3" % "3.9"
I got the following exception:
org.apache.hudi.hive.HoodieHiveSyncException: Failed in executing SQL CREATE EXTERNAL TABLE IF NOT EXISTS `default`.......' org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS INPUTFORMAT 'org.apache.hudi.hadoop.HoodieParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' LOCATION 's3a://testbucket/test'
Caused by: org.apache.hive.service.cli.HiveSQLException: Error running query: java.lang.NoClassDefFoundError: org/apache/hadoop/fs/StreamCapabilities
at org.apache.hive.jdbc.Utils.verifySuccess(Utils.java:300) ~[hive-jdbc-3.1.1.jar:3.1.1]
at org.apache.hive.jdbc.Utils.verifySuccessWithInfo(Utils.java:286) ~[hive-jdbc-3.1.1.jar:3.1.1]
at org.apache.hive.jdbc.HiveStatement.runAsyncOnServer(HiveStatement.java:324) ~[hive-jdbc-3.1.1.jar:3.1.1]
at org.apache.hive.jdbc.HiveStatement.execute(HiveStatement.java:265) ~[hive-jdbc-3.1.1.jar:3.1.1]
at org.apache.hudi.hive.HoodieHiveClient.updateHiveSQL(HoodieHiveClient.java:367) ~[hudi-spark-bundle_2.12-0.7.0.jar:0.7.0]
... 37 more
It seems like there is a jar version mismatch (the `NoClassDefFoundError: org/apache/hadoop/fs/StreamCapabilities` points to incompatible Hadoop versions on the classpath — note `hadoop-common` 3.1.4 alongside the other Hadoop 3.1.1 artifacts). You could open a Hudi GitHub issue to get a timely response from the community.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.