[英]Spark Job is not posting message to Kafka topic
我已经编写了 spark 作业来读取一个文件,将数据转换为 json 并将数据发布到 Kafka:我尝试了所有选项,例如 1.Putting thread.sleep 2.changing linger.ms lesser than thread.sleep.But nothing is working out ..它只是不向 kafKa 发布任何内容。我已经尝试过 producer.flush()/producer.close()。日志中没有错误。但它仍然没有发布任何内容。 如果我写一个普通的独立生产者来将消息发布到同一个 kafka 主题,那将没有任何问题。 因此,卡夫卡本身没有问题。 4.我可以看到我的发送方法正在从日志中调用。最后关闭也被调用了。没有错误。 请帮忙!!!!!!!!!!!!
这是我的项目的重要文件:
build.sbt:
名称:=“SparkStreamingExample”
//版本:=“0.1”
scalaVersion := "2.11.8"
val spark="2.3.1"
val kafka="0.10.1"
// https://mvnrepository.com/artifact/org.apache.kafka/kafka
dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-core" % "2.9.6"
dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-databind" % "2.9.6"
dependencyOverrides += "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.9.6"
// https://mvnrepository.com/artifact/com.fasterxml.jackson.dataformat/jackson-dataformat-cbor
dependencyOverrides += "com.fasterxml.jackson.dataformat" % "jackson-dataformat-cbor" % "2.9.6"
libraryDependencies += "org.apache.kafka" % "kafka_2.11" % "2.0.0"
// https://mvnrepository.com/artifact/org.apache.kafka/kafka
libraryDependencies += "org.apache.spark" % "spark-streaming_2.11" % spark
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.3.1"
libraryDependencies +="com.typesafe.play" %"play-json_2.11" % "2.6.6" exclude("com.fasterxml.jackson.core","jackson-databind")
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.3.1"
libraryDependencies +="com.typesafe" % "config" %"1.3.2"
MySparkKafkaProducer.scala
import java.util.Properties
import java.util.concurrent.Future
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}
class MySparkKafkaProducer(createProducer: () => KafkaProducer[String, String]) extends Serializable {
/* This is the key idea that allows us to work around running into
NotSerializableExceptions. */
@transient lazy val producer = createProducer()
def send(topic: String, key: String, value: String): Future[RecordMetadata] = {
println("inside send method")
producer.send(new ProducerRecord(topic, key, value))
}
def send(topic: String, value: String)= {
// println("inside send method")
producer.send(new ProducerRecord(topic, value))
}
// producer.send(new ProducerRecord[K, V](topic, value))
}
object MySparkKafkaProducer extends Serializable {
import scala.collection.JavaConversions._
def apply(config:Properties):MySparkKafkaProducer={
val f = () =>{
val producer =new KafkaProducer[String,String](config)
sys.addShutdownHook({
println("calling Closeeeeeeeeeee")
producer.flush()
producer.close
})
producer
}
new MySparkKafkaProducer(f)
}
}
AlibababaMainJob.scala
import java.util.Properties
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession
import com.typesafe.config.ConfigFactory
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.codehaus.jackson.map.ser.std.StringSerializer
object AlibababaMainJob {
def main(args:Array[String]) {
val ss = SparkSession.builder().master("local[*]").appName("AlibabaCoreJob").getOrCreate()
val conf = new SparkConf().setMaster("local[2]").setAppName("AlibabaCoreJob")
//val ssc = new StreamingContext(conf, Seconds(1))
// val ssc= new StreamingContext(getSparkConf(),6)
val coreJob = new AlibabaCoreJob()
val configuration = Configuration.apply(ConfigFactory.load.resolve)
implicit val rollUpProducer: Broadcast[MySparkKafkaProducer] = ss.sparkContext.broadcast(MySparkKafkaProducer(producerProps(configuration)))
println(s"==========Kafka Config======${configuration.kafka}")
coreJob.processRecordFromFile(ss, rollUpProducer)
Thread.sleep(1000)
//ssc.start()
// println(s"==========Spark context Sarted ]======${ssc.sparkContext.appName}")
/// ssc.awaitTermination()
//
//val ss = SparkSession.builder().master("local[*]").appName("AlibabaCoreJob").getOrCreate()
//Set Up kakfa Configuration:https://stackoverflow.com/questions/31590592/spark-streaming-read-and-write-on-kafka-topic
}
def producerProps(jobConfig:Configuration,extras:(String,String)*):Properties={
val p =new Properties()
p.put("bootstrap.servers",jobConfig.kafka.brokerList)
p.put("key.serializer","org.apache.kafka.common.serialization.StringSerializer")
p.put("value.serializer","org.apache.kafka.common.serialization.StringSerializer")
p.put("acks","all")
p.put("retries","3")
p.put("linger.ms", "1")
p
}
// coreJob.processRecordFromFile(ss,rollUpProducer)
//}
}
AlibabaCoreJob.scala
import java.util.Properties
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}
import play.api.libs.json._
import org.apache.kafka.clients.producer.ProducerConfig
class AlibabaCoreJob extends Serializable {
// implicit val transctionWrites = Json.writes[Transction]
//case class Transction(productCode:String,description:String,brand:String,category:String,unitPrice:String,ordItems:String,mode:String) extends Serializable
def processRecordFromFile(ss:SparkSession,kafkaProducer:Broadcast[MySparkKafkaProducer]):Unit={
println("Entering processRecordFromFile")
val rddFromFile = ss.sparkContext.textFile("src/main/resources/12_transactions_case_study.csv")
println("Entering loaded file")
val fileAfterHeader=rddFromFile.mapPartitionsWithIndex(
(idx,iterator)=>if(idx==0)
iterator.drop(0) else iterator)
println("Removed header")
processRdd(fileAfterHeader,kafkaProducer:Broadcast[MySparkKafkaProducer])
}
//Set Up kakfa Configuration:https://stackoverflow.com/questions/31590592/spark-streaming-read-and-write-on-kafka-topic
def processRdd(fileAfterHeader: RDD[String],kafkaProducer:Broadcast[MySparkKafkaProducer]) = {
println("Entering processRdd")
val rddList = fileAfterHeader.mapPartitions(
line => {
// println("lineeeeee>>>"+line)
line.map(x => x.split(",")).map(y => Transction(y(0), y(1), y(2), y(3), y(4), y(5), y(6))).toList.toIterator
})
rddList.foreach(lineitem=>{
// println("Entering foreach>>>>")
val jsonString:String=Json.stringify(Json.toJson(lineitem))
//val jsonString:String=lineitem.A
// println("calling kafka producer")
kafkaProducer.value.send("topic",jsonString)
// println("done calling kafka producer")
})
}
//Suppose you want to drop 1s 3 lines from file
// val da = fi.mapPartitionsWithIndex{ (id_x, iter) => if (id_x == 0) iter.drop(3) else iter }
//Create RowRDD by mapping each line to the required fields
// val rowRdd = da.map(x=>Row(x(0), x(1)))
//Map Partitions:
}
在您的项目中,添加以下依赖项:Spark-Sql、Spark-Core、Spark-Streaming、Spa-Streaming-Kafka-0-10。 您可以读取 dataframe 中的给定文件,执行您想要的任何类型的处理,然后在处理完成后,您可以将 dataframe 写入 kafka,如下所示
resultDF.writeStream.format("kafka")
.option("kafka.bootstrap.servers", "host1:port1,host2:port2")
.option("topic", "topic1")
您在此处参考文档以获得进一步的清晰度。
请注意,我假设您的处理结果将存储在名为resultDF的 Dataframe 中
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.