![](/img/trans.png)
[英]How to read such a nested multiline json file into a data frame with Spark/Scala
[英]How to rename spark data frame output file in AWS in spark SCALA
我將我的火花數據幀輸出保存為帶分區的scala中的csv文件。 這就是我在Zeppelin中的表現 。
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
import org.apache.spark.{ SparkConf, SparkContext }
import java.sql.{Date, Timestamp}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.functions.input_file_name
import org.apache.spark.sql.functions.regexp_extract
val get_cus_val = spark.udf.register("get_cus_val", (filePath: String) => filePath.split("\\.")(3))
val rdd = sc.textFile("s3://trfsmallfffile/FinancialLineItem/MAIN")
val header = rdd.filter(_.contains("LineItem.organizationId")).map(line => line.split("\\|\\^\\|")).first()
val schema = StructType(header.map(cols => StructField(cols.replace(".", "_"), StringType)).toSeq)
val data = sqlContext.createDataFrame(rdd.filter(!_.contains("LineItem.organizationId")).map(line => Row.fromSeq(line.split("\\|\\^\\|").toSeq)), schema)
val schemaHeader = StructType(header.map(cols => StructField(cols.replace(".", "."), StringType)).toSeq)
val dataHeader = sqlContext.createDataFrame(rdd.filter(!_.contains("LineItem.organizationId")).map(line => Row.fromSeq(line.split("\\|\\^\\|").toSeq)), schemaHeader)
val df1resultFinal=data.withColumn("DataPartition", get_cus_val(input_file_name))
val rdd1 = sc.textFile("s3://trfsmallfffile/FinancialLineItem/INCR")
val header1 = rdd1.filter(_.contains("LineItem.organizationId")).map(line => line.split("\\|\\^\\|")).first()
val schema1 = StructType(header1.map(cols => StructField(cols.replace(".", "_"), StringType)).toSeq)
val data1 = sqlContext.createDataFrame(rdd1.filter(!_.contains("LineItem.organizationId")).map(line => Row.fromSeq(line.split("\\|\\^\\|").toSeq)), schema1)
import org.apache.spark.sql.expressions._
val windowSpec = Window.partitionBy("LineItem_organizationId", "LineItem_lineItemId").orderBy($"TimeStamp".cast(LongType).desc)
val latestForEachKey = data1.withColumn("rank", rank().over(windowSpec)).filter($"rank" === 1).drop("rank", "TimeStamp")
val dfMainOutput = df1resultFinal.join(latestForEachKey, Seq("LineItem_organizationId", "LineItem_lineItemId"), "outer")
.select($"LineItem_organizationId", $"LineItem_lineItemId",
when($"DataPartition_1".isNotNull, $"DataPartition_1").otherwise($"DataPartition").as("DataPartition"),
when($"StatementTypeCode_1".isNotNull, $"StatementTypeCode_1").otherwise($"StatementTypeCode").as("StatementTypeCode"),
when($"FinancialConceptLocalId_1".isNotNull, $"FinancialConceptLocalId_1").otherwise($"FinancialConceptLocalId").as("FinancialConceptLocalId"),
when($"FinancialConceptGlobalId_1".isNotNull, $"FinancialConceptGlobalId_1").otherwise($"FinancialConceptGlobalId").as("FinancialConceptGlobalId"),
when($"FinancialConceptCodeGlobalSecondaryId_1".isNotNull, $"FinancialConceptCodeGlobalSecondaryId_1").otherwise($"FinancialConceptCodeGlobalSecondaryId").as("FinancialConceptCodeGlobalSecondaryId"),
when($"FFAction_1".isNotNull, $"FFAction_1").otherwise($"FFAction|!|").as("FFAction|!|"))
.filter(!$"FFAction|!|".contains("D|!|"))
val dfMainOutputFinal = dfMainOutput.na.fill("").select($"DataPartition",$"StatementTypeCode",concat_ws("|^|", dfMainOutput.schema.fieldNames.filter(_ != "DataPartition").map(c => col(c)): _*).as("concatenated"))
val headerColumn = dataHeader.columns.toSeq
val header = headerColumn.mkString("", "|^|", "|!|").dropRight(3)
val dfMainOutputFinalWithoutNull = dfMainOutputFinal.withColumn("concatenated", regexp_replace(col("concatenated"), "|^|null", "")).withColumnRenamed("concatenated", header)
dfMainOutputFinalWithoutNull.repartition(1).write.partitionBy("DataPartition","StatementTypeCode")
.format("csv")
.option("nullValue", "")
.option("delimiter", "\t")
.option("quote", "\u0000")
.option("header", "true")
.option("codec", "gzip")
.save("s3://trfsmallfffile/FinancialLineItem/output")
val FFRowCount =dfMainOutputFinalWithoutNull.groupBy("DataPartition","StatementTypeCode").count
FFRowCount.coalesce(1).write.format("com.databricks.spark.xml")
.option("rootTag", "FFFileType")
.option("rowTag", "FFPhysicalFile")
.save("s3://trfsmallfffile/FinancialLineItem/Descr")
現在文件保存在預期的分區文件夾結構中。
現在我的要求是重命名所有零件文件並將其保存在一個目錄中。 該文件的名稱將作為文件夾結構的名稱。
例如,我有一個文件保存在folder/DataPartition=Japan/PartitionYear=1971/part-00001-87a61115-92c9-4926-a803-b46315e55a08.c000.csv.gz
現在我想要我的文件名
Japan.1971.1.txt.gz
Japan.1971.2.txt.gz
我的工作完成之后,我已經在java map-reduce中完成了這個,然后我正在閱讀HDFS文件系統,然后將其移動到不同的位置作為重命名的文件名。
但是如何在Spark SCALA中的AWS S3文件系統中實現這一點。
據我所知,沒有直接的方法來重命名火花數據幀輸出文件名。
但是有一個實現可以在作業本身使用MultipleOutputs
作為saveAsHadoopFile但是如何做到這一點?
我在scala中尋找一些示例代碼
就像完成工作后我們需要從s3讀取文件,重新命名並將其移動到其他位置。
val tempOutPath = "mediamath.dir"
headerDf.union(outDf)
.repartition(1)
.write
.mode(SaveMode.Overwrite)
.format("text")
.option("codec", "gzip")
.save(tempOutPath)
import org.apache.hadoop.fs._
val sc = spark.sparkContext
val fs = FileSystem.get(sc.hadoopConfiguration)
val file = fs.globStatus(new Path("mediamath.dir/part*.gz"))(0).getPath.getName
fs.rename(new Path("mediamath.dir/" + file), new Path(<aws-s3-path>))
這是我的代碼片段,請看看這是否對您有所幫助。
AFAIK,如果你想直接在S3存儲桶中重命名文件/對象, 那是不可能的。
您可以實現
rename = copy to target + delete source
def prepareNewFilename(oldFilename: String) = {
val pattern = raw".*/DataPartition=%s/PartitionYear=%s/part-%s.*\.%s"
.format("([A-Za-z]+)", "([0-9]+)", "([0-9]+)", "([a-z]+)")
.r
val pattern(country, year, part, extn) = oldFilename
"%s.%s.%s.%s.%s".format(country, year, part, "txt", extn)
}
val oldFilename = "folder/DataPartition=Japan/PartitionYear=1971/part-00001-87a61115-92c9-4926-a803-b46315e55a08.c000.csv.gz"
val newFilename = prepareNewFilename(oldFilename)
//newFilename: String = Japan.1971.00001.txt.gz
import com.amazonaws.AmazonServiceException
import com.amazonaws.services.s3.AmazonS3ClientBuilder
val s3 = AmazonS3ClientBuilder.defaultClient()
try {
s3.copyObject(sourceBkt, oldFilename, targetBkt, newFilename)
s3.deleteObject(sourceBkt, oldFilename)
} catch {
case e: AmazonServiceException =>
System.err.println(e.getErrorMessage)
System.exit(1)
}
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.