简体   繁体   中英

Append new rows to a Spark dataframe based on a condition

I need some help with this tricky transformation:

My Spark DataFrame looks like this:

+---+---+--------+---------+-------+--------+---------+
|  A|  B|rcvr_nbr|order_qty|rcv_qty|item_nbr|edit_flag|
+---+---+--------+---------+-------+--------+---------+
|123|  1|     500|       10|      2|    1001|    false|
|123|  1|     501|       10|      2|    1001|    false|
|123|  4|     502|       60|      5|    1001|     true|
|123|  2|     504|       40|     30|    1003|    false|
|123|  5|     510|       10|     10|    1003|     true|
+---+---+--------+---------+-------+--------+---------+

I need to add new records (rows) for each combination of A and B that has edit_flag = false whenever its item_nbr matches the item_nbr of another A and B combination that has edit_flag = true.

The new row will have every column copied from its parent row (the edit_flag = false row) except rcv_qty and rcvr_nbr, which are taken from the matching edit_flag = true row. So the final output will look like this:

+---+---+--------+---------+-------+--------+---------+
|  A|  B|rcvr_nbr|order_qty|rcv_qty|item_nbr|edit_flag|
+---+---+--------+---------+-------+--------+---------+
|123|  1|     500|       10|      2|    1001|    false|
|123|  1|     501|       10|      2|    1001|    false|
|123|  1|     502|       10|      5|    1001|    false|
|123|  4|     502|       60|      5|    1001|     true|
|123|  2|     504|       40|     30|    1003|    false|
|123|  2|     510|       40|     10|    1003|    false|
|123|  5|     510|       10|     10|    1003|     true|
+---+---+--------+---------+-------+--------+---------+
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

import spark.implicits._

/** Row type for the sample data; field names and order mirror the DataFrame columns.
  *
  * Declared `final` because case classes are not meant to be extended.
  *
  * @param A         first part of the (A, B) key identifying a group of rows
  * @param B         second part of the (A, B) key identifying a group of rows
  * @param rcvr_nbr  receiver number; copied from the matching `edit_flag = true` row on appended rows
  * @param order_qty ordered quantity; copied from the parent row on appended rows
  * @param rcv_qty   received quantity; copied from the matching `edit_flag = true` row on appended rows
  * @param item_nbr  item number used to match `edit_flag = false` rows with `edit_flag = true` rows
  * @param edit_flag `true` for rows that donate `rcvr_nbr` / `rcv_qty` to matching `false` rows
  */
final case class Source(
    A: Int,
    B: Int,
    rcvr_nbr: Int,
    order_qty: Int,
    rcv_qty: Int,
    item_nbr: Int,
    edit_flag: Boolean
)

// Sample input reproducing the table from the question, one Source per row.
val sourceDF = {
  val sampleRows = List(
    Source(A = 123, B = 1, rcvr_nbr = 500, order_qty = 10, rcv_qty = 2, item_nbr = 1001, edit_flag = false),
    Source(A = 123, B = 1, rcvr_nbr = 501, order_qty = 10, rcv_qty = 2, item_nbr = 1001, edit_flag = false),
    Source(A = 123, B = 4, rcvr_nbr = 502, order_qty = 60, rcv_qty = 5, item_nbr = 1001, edit_flag = true),
    Source(A = 123, B = 2, rcvr_nbr = 504, order_qty = 40, rcv_qty = 30, item_nbr = 1003, edit_flag = false),
    Source(A = 123, B = 5, rcvr_nbr = 510, order_qty = 10, rcv_qty = 10, item_nbr = 1003, edit_flag = true)
  )
  // Column names are derived from the case-class field names.
  sampleRows.toDF()
}

// Print the inferred schema; every column is a non-nullable primitive.
sourceDF.printSchema()
//    root
//    |-- A: integer (nullable = false)
//    |-- B: integer (nullable = false)
//    |-- rcvr_nbr: integer (nullable = false)
//    |-- order_qty: integer (nullable = false)
//    |-- rcv_qty: integer (nullable = false)
//    |-- item_nbr: integer (nullable = false)
//    |-- edit_flag: boolean (nullable = false)

// Print the input rows without truncating cell contents.
sourceDF.show(truncate = false)
//    +---+---+--------+---------+-------+--------+---------+
//    |A  |B  |rcvr_nbr|order_qty|rcv_qty|item_nbr|edit_flag|
//    +---+---+--------+---------+-------+--------+---------+
//    |123|1  |500     |10       |2      |1001    |false    |
//    |123|1  |501     |10       |2      |1001    |false    |
//    |123|4  |502     |60       |5      |1001    |true     |
//    |123|2  |504     |40       |30     |1003    |false    |
//    |123|5  |510     |10       |10     |1003    |true     |
//    +---+---+--------+---------+-------+--------+---------+

// Rows flagged as edited; these donate rcvr_nbr / rcv_qty to the appended rows.
val sourceDFTrueF = sourceDF.filter(col("edit_flag") === true)

// Same rows with every column suffixed "_true", so that after the join the
// columns of the two sides can be told apart by name.
val sourceDFTrue = {
  val suffixedNames = sourceDFTrueF.columns.map(name => s"${name}_true")
  sourceDFTrueF.toDF(suffixedNames: _*)
}

// Parent rows for the rows to be appended: one representative row per
// (A, B, item_nbr) among the rows with edit_flag = false.
//
// De-duplicating on item_nbr alone would keep a single arbitrary row per item
// and silently drop every other (A, B) combination sharing that item, whereas
// each (A, B) combination must receive its own appended row.
//
// NOTE: dropDuplicates keeps an arbitrary row of each group, so if order_qty
// differs between rows of the same (A, B, item_nbr) the copied value is not
// deterministic.
val sourceDFFalse = sourceDF
  .filter(col("edit_flag") === false)
  .dropDuplicates("A", "B", "item_nbr")

// Build the rows to append and add them to the original data.
//
// Each de-duplicated edit_flag = false row is paired with every
// edit_flag = true row that has the same item_nbr. The appended row keeps
// A, B, order_qty, item_nbr and edit_flag from the false (parent) row and
// takes rcvr_nbr and rcv_qty from the true row.
val resDF =
  sourceDFFalse
    .join(
      sourceDFTrue,
      sourceDFFalse.col("item_nbr") === sourceDFTrue.col("item_nbr_true"),
      // Inner join: a row is appended only when a matching edit_flag = true
      // row exists. A left join would also emit rows with null rcvr_nbr /
      // rcv_qty for items that have no edit_flag = true counterpart.
      "inner"
    )
    .select(
      // union is positional, so this column order must match sourceDF's.
      sourceDFFalse.col("A"),
      sourceDFFalse.col("B"),
      sourceDFTrue.col("rcvr_nbr_true").alias("rcvr_nbr"),
      sourceDFFalse.col("order_qty"),
      sourceDFTrue.col("rcv_qty_true").alias("rcv_qty"),
      sourceDFFalse.col("item_nbr"),
      sourceDFFalse.col("edit_flag")
    )
    .union(sourceDF)
    .orderBy(col("A"), col("item_nbr"), col("edit_flag"))
resDF.show(false)
// Rows that tie on (A, item_nbr, edit_flag) may appear in any relative order.
//    +---+---+--------+---------+-------+--------+---------+
//    |A  |B  |rcvr_nbr|order_qty|rcv_qty|item_nbr|edit_flag|
//    +---+---+--------+---------+-------+--------+---------+
//    |123|1  |501     |10       |2      |1001    |false    |
//    |123|1  |500     |10       |2      |1001    |false    |
//    |123|1  |502     |10       |5      |1001    |false    |
//    |123|4  |502     |60       |5      |1001    |true     |
//    |123|2  |504     |40       |30     |1003    |false    |
//    |123|2  |510     |40       |10     |1003    |false    |
//    |123|5  |510     |10       |10     |1003    |true     |
//    +---+---+--------+---------+-------+--------+---------+

The technical posts on this site are published under the CC BY-SA 4.0 license. If you reprint them, please credit the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM