
org.apache.spark.sql.AnalysisException: Queries with streaming sources must be executed with writeStream.start();; FileSource[hdfs://localhost:9000..]
I am trying to join a CSV source stream with a rate source stream:
spark2.readStream
.format("csv")
.schema(schema)
.option("delimiter", ";")
.option("maxFilesPerTrigger", 1)
.csv("hdfs://"+hdfsLocation+"/generator/"+shortPath)
.persist(StorageLevel.MEMORY_ONLY_SER)
and
val rate = spark.readStream
.format("rate")
.option("rowsPerSecond", 1)
.option("numPartitions", 10)
.load()
and
val cvsStream2 = rate.as("counter").join(cvsStream.as("csv")).where("csv.id == mod(counter.value," + divider + ")").withWatermark("timestamp", "1 second")
which is later used like this:
dataFrame.select(
lit(topicName) as "key",
concat(date_format(col("timestamp"),"yyyy-MM-DD'T'HH:mm:ss.SSS'Z'"), lit(" : "), lit(digitalTwinId), lit(" : "), lit(new ColumnName(columnName))) as "value")
.writeStream
.format("console")
.option("truncate", "false")
.queryName("kafkaDataGenerator" + columnName)
.start()
but I get the exception:
org.apache.spark.sql.AnalysisException: Queries with streaming sources must be executed with writeStream.start();;
FileSource[hdfs://localhost:9000/generator/csv5922743056983510526/kikai.csv]
at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$.throwError(UnsupportedOperationChecker.scala:437)
at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$.$anonfun$checkForBatch$1(UnsupportedOperationChecker.scala:37)
at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$.$anonfun$checkForBatch$1$adapted(UnsupportedOperationChecker.scala:35)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:177)
at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$.checkForBatch(UnsupportedOperationChecker.scala:35)
at org.apache.spark.sql.execution.QueryExecution.assertSupported(QueryExecution.scala:62)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$withCachedData$1(QueryExecution.scala:73)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.QueryExecution.withCachedData$lzycompute(QueryExecution.scala:71)
at org.apache.spark.sql.execution.QueryExecution.withCachedData(QueryExecution.scala:71)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$optimizedPlan$1(QueryExecution.scala:82)
at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:133)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:133)
at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:82)
at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:79)
at org.apache.spark.sql.execution.QueryExecution.assertOptimized(QueryExecution.scala:85)
at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:103)
at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:100)
at org.apache.spark.sql.execution.CacheManager.$anonfun$cacheQuery$2(CacheManager.scala:90)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.CacheManager.cacheQuery(CacheManager.scala:85)
at org.apache.spark.sql.Dataset.persist(Dataset.scala:3146)
at io.sekai.core.streaming.KafkaDataGenerator.readFromCSVFile(KafkaDataGenerator.scala:63)
at io.sekai.core.streaming.KafkaDataGenerator.readFromCSVFile$(KafkaDataGenerator.scala:21)
at io.sekai.core.streaming.KafkaDataGeneratorTest$$anon$1.readFromCSVFile(KafkaDataGeneratorTest.scala:19)
at io.sekai.core.streaming.KafkaDataGenerator.generateStream(KafkaDataGenerator.scala:79)
at io.sekai.core.streaming.KafkaDataGenerator.generateStream$(KafkaDataGenerator.scala:66)
at io.sekai.core.streaming.KafkaDataGeneratorTest$$anon$1.generateStream(KafkaDataGeneratorTest.scala:19)
at io.sekai.core.streaming.KafkaDataGenerator.generateStream(KafkaDataGenerator.scala:151)
at io.sekai.core.streaming.KafkaDataGenerator.generateStream$(KafkaDataGenerator.scala:138)
at io.sekai.core.streaming.KafkaDataGeneratorTest$$anon$1.generateStream(KafkaDataGeneratorTest.scala:19)
at io.sekai.core.streaming.KafkaDataGeneratorTest.$anonfun$new$1(KafkaDataGeneratorTest.scala:30)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.scalatest.OutcomeOf.outcomeOf(OutcomeOf.scala:85)
at org.scalatest.OutcomeOf.outcomeOf$(OutcomeOf.scala:83)
at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104)
at org.scalatest.Transformer.apply(Transformer.scala:22)
at org.scalatest.Transformer.apply(Transformer.scala:20)
at org.scalatest.FlatSpecLike$$anon$1.apply(FlatSpecLike.scala:1682)
at org.scalatest.TestSuite.withFixture(TestSuite.scala:196)
at org.scalatest.TestSuite.withFixture$(TestSuite.scala:195)
at io.sekai.core.streaming.KafkaDataGeneratorTest.org$scalamock$scalatest$AbstractMockFactory$$super$withFixture(KafkaDataGeneratorTest.scala:12)
at org.scalamock.scalatest.AbstractMockFactory.$anonfun$withFixture$1(AbstractMockFactory.scala:35)
at org.scalamock.MockFactoryBase.withExpectations(MockFactoryBase.scala:51)
at org.scalamock.MockFactoryBase.withExpectations$(MockFactoryBase.scala:43)
at io.sekai.core.streaming.KafkaDataGeneratorTest.withExpectations(KafkaDataGeneratorTest.scala:12)
at org.scalamock.scalatest.AbstractMockFactory.withFixture(AbstractMockFactory.scala:34)
at org.scalamock.scalatest.AbstractMockFactory.withFixture$(AbstractMockFactory.scala:31)
at io.sekai.core.streaming.KafkaDataGeneratorTest.withFixture(KafkaDataGeneratorTest.scala:12)
at org.scalatest.FlatSpecLike.invokeWithFixture$1(FlatSpecLike.scala:1680)
at org.scalatest.FlatSpecLike.$anonfun$runTest$1(FlatSpecLike.scala:1692)
at org.scalatest.SuperEngine.runTestImpl(Engine.scala:289)
at org.scalatest.FlatSpecLike.runTest(FlatSpecLike.scala:1692)
at org.scalatest.FlatSpecLike.runTest$(FlatSpecLike.scala:1674)
at org.scalatest.FlatSpec.runTest(FlatSpec.scala:1685)
at org.scalatest.FlatSpecLike.$anonfun$runTests$1(FlatSpecLike.scala:1750)
at org.scalatest.SuperEngine.$anonfun$runTestsInBranch$1(Engine.scala:396)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:384)
at org.scalatest.SuperEngine.runTestsInBranch(Engine.scala:373)
at org.scalatest.SuperEngine.$anonfun$runTestsInBranch$1(Engine.scala:410)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:384)
at org.scalatest.SuperEngine.runTestsInBranch(Engine.scala:379)
at org.scalatest.SuperEngine.runTestsImpl(Engine.scala:461)
at org.scalatest.FlatSpecLike.runTests(FlatSpecLike.scala:1750)
at org.scalatest.FlatSpecLike.runTests$(FlatSpecLike.scala:1749)
at org.scalatest.FlatSpec.runTests(FlatSpec.scala:1685)
at org.scalatest.Suite.run(Suite.scala:1147)
at org.scalatest.Suite.run$(Suite.scala:1129)
at org.scalatest.FlatSpec.org$scalatest$FlatSpecLike$$super$run(FlatSpec.scala:1685)
at org.scalatest.FlatSpecLike.$anonfun$run$1(FlatSpecLike.scala:1795)
at org.scalatest.SuperEngine.runImpl(Engine.scala:521)
at org.scalatest.FlatSpecLike.run(FlatSpecLike.scala:1795)
at org.scalatest.FlatSpecLike.run$(FlatSpecLike.scala:1793)
at io.sekai.core.streaming.KafkaDataGeneratorTest.org$scalatest$BeforeAndAfterAll$$super$run(KafkaDataGeneratorTest.scala:12)
at org.scalatest.BeforeAndAfterAll.liftedTree1$1(BeforeAndAfterAll.scala:213)
at org.scalatest.BeforeAndAfterAll.run(BeforeAndAfterAll.scala:210)
at org.scalatest.BeforeAndAfterAll.run$(BeforeAndAfterAll.scala:208)
at io.sekai.core.streaming.KafkaDataGeneratorTest.run(KafkaDataGeneratorTest.scala:12)
at org.scalatest.junit.JUnitRunner.run(JUnitRunner.scala:99)
at org.junit.runner.JUnitCore.run(JUnitCore.java:137)
at com.intellij.junit4.JUnit4IdeaTestRunner.startRunnerWithArgs(JUnit4IdeaTestRunner.java:68)
at com.intellij.rt.junit.IdeaTestRunner$Repeater.startRunnerWithArgs(IdeaTestRunner.java:33)
at com.intellij.rt.junit.JUnitStarter.prepareStreamsAndStart(JUnitStarter.java:230)
at com.intellij.rt.junit.JUnitStarter.main(JUnitStarter.java:58)
If I switch from the streaming DataFrame back to a batch DataFrame, everything works fine:
spark2.read
.format("csv")
.schema(schema)
.option("delimiter", ";")
.option("maxFilesPerTrigger", 1)
.csv("hdfs://"+hdfsLocation+"/generator/"+shortPath)
.persist(StorageLevel.MEMORY_ONLY_SER)
It is not clear to me which query the message Queries with streaming sources must be executed with writeStream.start() is waiting for, given that 1) the streaming DataFrame comes from readStream, so I cannot attach writeStream.start() to it at that point, and 2) a batch DataFrame cannot be given writeStream.start() either. What is actually wrong with the CSV stream?
Unfortunately, I can never print the executed physical plan, because the exception is thrown before the DataFrame for the CSV stream is even created.
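For reference, here is a minimal, self-contained sketch that reproduces the same exception without the CSV source (assuming a local SparkSession; the rate source just stands in for the file source), since the stack trace shows the check being triggered from Dataset.persist:
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel

// hypothetical local session, only for the repro; the real code runs against HDFS
val spark = SparkSession.builder()
  .master("local[2]")
  .appName("persist-on-stream-repro")
  .getOrCreate()

// any streaming source will do; rate stands in for the csv FileSource here
val stream = spark.readStream
  .format("rate")
  .option("rowsPerSecond", 1)
  .load()

// this single call already throws:
// org.apache.spark.sql.AnalysisException: Queries with streaming sources
// must be executed with writeStream.start();;
stream.persist(StorageLevel.MEMORY_ONLY_SER)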
If I remove the persist part, it runs, but the performance is very poor:
spark2.readStream
.format("csv")
//.option("inferSchema", "true") //is it working?
//.option("header", "true") //is it working?
.schema(schema)
.option("delimiter", ";")
.option("maxFilesPerTrigger", 1)
.csv("hdfs://"+hdfsLocation+"/generator/"+shortPath)
// .persist(StorageLevel.MEMORY_ONLY_SER)
Why does persist fail on the streaming DataFrame when it works on the batch DataFrame, and why does the streaming DataFrame only run without persist?
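As an aside, instead of persisting the streaming DataFrame itself, I am considering persisting each micro-batch inside foreachBatch (untested sketch, assuming Spark 2.4+ where foreachBatch is available; the show call is only a placeholder for the real console/Kafka output):
import org.apache.spark.sql.DataFrame
import org.apache.spark.storage.StorageLevel

// defined as a val of the Scala function type to avoid the foreachBatch overload ambiguity in Scala 2.12
val persistEachBatch: (DataFrame, Long) => Unit = (batchDF, batchId) => {
  // persist the already-materialized micro-batch, not the streaming plan
  batchDF.persist(StorageLevel.MEMORY_ONLY_SER)
  batchDF.show(false) // placeholder sink logic
  batchDF.unpersist()
}

val query = cvsStream2.writeStream
  .foreachBatch(persistEachBatch)
  .queryName("kafkaDataGeneratorBatched")
  .start()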