
Apache Spark streaming with Kafka and Hive integration

Is there any way to integrate Apache Spark Structured Streaming with Apache Hive and Apache Kafka in a single application?

After collecting the streaming Dataset into a list with collectAsList, I got the error below.

Can anyone help me resolve this issue?

Thanks in advance.

import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;


public class DatasetKafka {
    public static void main(String[] args) throws IOException {
        SparkSession spark = SparkSession
                .builder()
                .appName("Java Spark Hive Example").master("yarn")
                .config("spark.sql.warehouse.dir", "hdfs://localhost:54310/user/hive/warehouse")
                .enableHiveSupport()
                .getOrCreate();
        Logger.getRootLogger().setLevel(Level.ERROR);
        Dataset<String> lines = spark
                  .readStream()
                  .format("kafka")
                  .option("kafka.bootstrap.servers", "localhost:9092")
                  .option("subscribe", "test")
                  .load().selectExpr("CAST(value AS STRING)")
                  .as(Encoders.STRING());
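        // collectAsList() is a batch action; calling it on a streaming Dataset
        // throws the AnalysisException shown below ("Queries with streaming
        // sources must be executed with writeStream.start()").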
        List<String> line=lines.collectAsList();
        for(String li:line) {
            String values[]=li.split(",");
            String query="insert into table match values("+Integer.parseInt(
            values[0])+
            ","+values[1]+
            ","+Integer.parseInt(values[2])+
            ","+Integer.parseInt(values[3])+
            ","+Integer.parseInt(values[4])+
            ","+values[5]+
            ","+Integer.parseInt(values[6])+
            ","+values[7]+
            ","+Integer.parseInt(values[8])+
            ","+Integer.parseInt(values[9])+
            ","+Integer.parseInt(values[10])+
            ","+values[11]+
            ","+Integer.parseInt(values[12])+
            ","+Integer.parseInt(values[13])+
            ","+Integer.parseInt(values[14])+
            ","+Integer.parseInt(values[15])+
            ","+Integer.parseInt(values[16])+
            ","+values[17]+
            ","+values[18]+")";
            spark.sql(query);
        }

//      List<String> values=ll.collectAsList();
        Dataset<String> words=lines.map((MapFunction<String, String>)k->{
            return k;
        }, Encoders.STRING());
        Dataset<Row> wordCounts = words.flatMap(
                (FlatMapFunction<String, String>) x -> Arrays.asList(x.split(",")).iterator(),
                Encoders.STRING()).groupBy("value").count();
        StreamingQuery query = wordCounts.writeStream()
                  .outputMode("complete")
                  .format("console")
                  .start();
                try {
                    query.awaitTermination();
                } catch (StreamingQueryException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
    }
}

Exception in thread "main" org.apache.spark.sql.AnalysisException: Queries with streaming sources must be executed with writeStream.start();;
kafka
    at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$.org$apache$spark$sql$catalyst$analysis$UnsupportedOperationChecker$$throwError(UnsupportedOperationChecker.scala:389)
    at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$$anonfun$checkForBatch$1.apply(UnsupportedOperationChecker.scala:38)
    at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$$anonfun$checkForBatch$1.apply(UnsupportedOperationChecker.scala:36)
    at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:127)
    at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:126)
    at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:126)
    at scala.collection.immutable.List.foreach(List.scala:392)
    at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:126)
    at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$.checkForBatch(UnsupportedOperationChecker.scala:36)
    at org.apache.spark.sql.execution.QueryExecution.assertSupported(QueryExecution.scala:51)
    at org.apache.spark.sql.execution.QueryExecution.withCachedData$lzycompute(QueryExecution.scala:62)
    at org.apache.spark.sql.execution.QueryExecution.withCachedData(QueryExecution.scala:60)
    at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:66)
    at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:66)
    at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:72)
    at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:68)
    at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:77)
    at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:77)
    at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3360)
    at org.apache.spark.sql.Dataset.collectAsList(Dataset.scala:2794)
    at com.ges.kafka.DatasetKafka.main(DatasetKafka.java:48)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

You could use the Hive Warehouse Connector (HWC) together with Spark's Kafka source library (spark-sql-kafka-0-10).

Both libraries are available on Maven.
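As a rough sketch, the sbt coordinates could look like the following, assuming Spark 2.4 on Scala 2.11 and an HDP 3.1 build of the connector. The HWC group/artifact ID, repository URL, and version strings below are assumptions from that setup and must be aligned with your own cluster; the connector is published in the Hortonworks repository rather than Maven Central.

  // build.sbt sketch -- versions are placeholders, align them with your cluster
  scalaVersion := "2.11.12"

  resolvers += "hortonworks-releases" at "https://repo.hortonworks.com/content/repositories/releases/"

  libraryDependencies ++= Seq(
    "org.apache.spark"     %% "spark-sql"                % "2.4.0" % "provided",
    "org.apache.spark"     %% "spark-sql-kafka-0-10"     % "2.4.0",
    // HWC coordinates/version are an assumption; check your HDP/CDP release.
    "com.hortonworks.hive" %% "hive-warehouse-connector" % "1.0.0.3.1.0.0-78" % "provided"
  )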

A simple example of a Spark Structured Streaming application is shown below. Make sure to create the Hive table in advance (a sketch of the table DDL follows the example).

  // Assumes an existing SparkSession `spark` and a loaded Typesafe `config` object.
  import com.hortonworks.hwc.HiveWarehouseSession
  import org.apache.spark.sql.streaming.Trigger
  import scala.concurrent.duration._

  val ds = spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", config.getString("broker.list"))
    .option("kafka.security.protocol", config.getString("security.protocol"))
    .option("subscribe", config.getString("kafka.topic.in"))
    .option("startingOffsets", config.getString("kafka.starting.offset"))
    .option("failOnDataLoss", "false")
    .load()
    .selectExpr("CAST(key AS STRING) AS key", "CAST(value AS STRING)")


  val query = ds.writeStream
    .format(HiveWarehouseSession.STREAM_TO_STREAM)
    .option("database", "my_database")
    .option("table", "my_table")
    .option("metastoreUri", spark.conf.get("spark.datasource.hive.warehouse.metastoreUri"))
    .option("checkpointLocation", config.getString("spark.checkpoint.dir"))
    .trigger(Trigger.ProcessingTime(config.getLong("spark.batchWindowSizeSecs").seconds))
    .start()

  query.awaitTermination()
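The HWC streaming sink writes through Hive's streaming ingest, which generally requires a transactional (ACID) ORC table, so the target table has to exist before the query starts. A minimal sketch of creating it up front is shown below; HiveWarehouseSession.session(...).build() and executeUpdate(...) are part of the HWC API, while the two STRING columns simply mirror the key/value projection in the example and should be adjusted to your real schema.

  // Sketch: create the transactional target table before starting the stream.
  // The key/value columns match the selectExpr projection above; adjust as needed.
  val hive = HiveWarehouseSession.session(spark).build()
  hive.executeUpdate(
    """CREATE TABLE IF NOT EXISTS my_database.my_table (
      |  key   STRING,
      |  value STRING
      |)
      |STORED AS ORC
      |TBLPROPERTIES ('transactional' = 'true')""".stripMargin)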
