
Type mismatch error. found Array[String] requires Seq[?] in Scala Spark

Below is the code and the build error I am seeing. Can you tell me how to resolve this error? This is the complete code; the URL has been omitted. The Spark version used is 1.6.0 and the Scala version is 2.10.5.

Code

import java.net.{HttpURLConnection, URL}
import org.slf4j.LoggerFactory
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
import java.util.Arrays
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType._
import scala.Predef.exceptionWrapper

object CoExtract {

    private val logger = LoggerFactory.getLogger(getClass)
    def main(args: Array[String]) {

    val sparkConf = new SparkConf()
    val sc = new SparkContext(sparkConf)
    val sqlcontext = new SQLContext(sc)
//  val sqlcontext = new HiveContext(sc)

    import sqlcontext.implicits._

    val obj = new Connection
    val string_response=obj.getCsvResponse(obj.getConnection(new URL("https://")))

    val array_response = string_response.split("\n")
    logger.info("The length of the array is "+ array_response.length)
    val rdd_response=sc.parallelize(array_response.toSeq)

    logger.info("The count of elements in the rdd are "+rdd_response.count())

val header = rdd_response.first()
logger.info("header"+header)

val noheaderRDD = rdd_response.filter(_ != header)
logger.info("NoheaderRDD is"+noheaderRDD.first())

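// Quote-aware split: the lookahead matches a comma only when it is followed by
// an even number of double quotes, i.e. when the comma sits outside quoted CSV fields.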
val subsetRdd = noheaderRDD.map(x => Row(
  x.split(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)", -1)(0),
  x.split(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)", -1)(1),
  x.split(",")(2)
))

val x =subsetRdd.zipWithIndex().collect()

val schema = new StructType()
  .add(StructField("Email",StringType, true))
  .add(StructField("Recipient",StringType, true))
  .add(StructField("rowid", LongType, false))


val rdd_to_df = sqlcontext.createDataFrame(subsetRdd,schema)
val df_to_rdd_again = rdd_to_df.rdd.zipWithIndex

rdd_to_df.withColumn("rowid", row_number.over(Window.partitionBy(lit(1)).orderBy(lit(1))))
val final_df = sqlcontext.createDataFrame(df_to_rdd_again.map{case (row,index) => Row.fromSeq(row.toSeq ++ Seq(index))}, schema )

val start: Int = 0
val end: Int = rdd_to_df.count().toInt
var counter:Int = start



 logger.info("Final Count" + end)
    logger.info("The schema of the dataframe is "+rdd_to_df.printSchema())
final_df.show(100,false)

logger.info("Schema of rdd to df" + rdd_to_df.printSchema())
logger.info("schema of final_df" + final_df.printSchema())

val df_response = sqlcontext.read.format("com.databricks.spark.csv").option("header", "true").load("hdfs:///")

logger.info("數據幀的模式為"+df_response) logger.info("數據幀的計數為"+df_response.count())

    }
}

Build error

scala:48: error: type mismatch;
[ERROR]  found   : Array[String]
[ERROR]  required: Seq[?]
[ERROR] Error occurred in an application involving default arguments.
[INFO]     val rdd_response=sc.parallelize(array_response)
[INFO]                                     ^
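For context, SparkContext.parallelize in Spark 1.6 is declared roughly as below (paraphrased from the Spark source). Its first parameter is a Seq[T], and the second has a default value; in some Scala 2.10 builds the implicit Array-to-Seq widening is not applied when the call also relies on a default argument, which is why the compiler's note mentions default arguments.

def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T]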

Just convert the Array to a Seq:

val rdd_response=sc.parallelize(array_response.toSeq)
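A minimal, self-contained sketch of the fix, assuming a local master and an inline CSV string in place of the omitted URL (the object name ParallelizeFix, the sample data, and the local[*] master are illustrative, not from the original post):

import org.apache.spark.{SparkConf, SparkContext}

object ParallelizeFix {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ParallelizeFix").setMaster("local[*]"))

    // Stand-in for the CSV response fetched over HTTP in the original code.
    val string_response = "Email,Recipient,Count\na@x.com,b@x.com,1\nc@x.com,d@x.com,2"

    // split("\n") returns Array[String]; converting explicitly with .toSeq
    // gives parallelize the Seq[String] it expects, without relying on the
    // implicit Array-to-Seq widening that failed here.
    val array_response = string_response.split("\n")
    val rdd_response = sc.parallelize(array_response.toSeq)

    println("The count of elements in the rdd is " + rdd_response.count()) // 3
    sc.stop()
  }
}

If the failure really stems from the default-argument interaction, passing numSlices explicitly may also work, but .toSeq is the simpler, version-independent fix that keeps the rest of the call site unchanged.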

