A table of 5-tuples (PRODUCT_ID, TRANSACTION_TYPE, QUANTITY, PRICE, DATE). TRANSACTION_TYPE is one of "Buy" or "Sell". QUANTITY is the number of units of the product bought or sold, at the PRICE indicated, on the DATE.
A product that is sold is offset against the inventory already in hand, starting with the earliest instance of that inventory.
Net profit is calculated by offsetting the sold inventory against the earliest bought inventory; if that does not fully cover the sale, the next bought inventory is used, and so on. In other words, first-in, first-out (FIFO) matching.
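To make the rule concrete, here is a minimal sketch of the FIFO matching in plain Scala, without Spark. The Txn case class and fifoProfit are illustrative names I am introducing, not part of the actual schema, and the sketch assumes the transactions for one product are already sorted by date:

// Minimal FIFO sketch for a single product's transactions, sorted by date.
case class Txn(txnType: String, quantity: Long, price: Double)

def fifoProfit(txns: Seq[Txn]): Double = {
  // Remaining (quantity, price) of each Buy lot, earliest first.
  var lots = txns.filter(_.txnType == "Buy").map(t => (t.quantity, t.price)).toList
  var profit = 0.0
  for (t <- txns if t.txnType == "Sell") {
    var toMatch = t.quantity
    while (toMatch > 0 && lots.nonEmpty) {
      val (q, p) = lots.head
      val used = math.min(toMatch, q)
      profit += used * (t.price - p) // sell price minus FIFO cost
      toMatch -= used
      // Drop the lot if exhausted, otherwise keep its unmatched remainder
      lots = if (used == q) lots.tail else (q - used, p) :: lots.tail
    }
  }
  profit
}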
For instance, consider the following table values:
1, Buy, 10, 100.0, Jan 1
2, Buy, 20, 200.0, Jan 2
1, Buy, 15, 150.0, Jan 3
1, Sell, 5, 120.0, Jan 5
1, Sell, 10, 125.0, Jan 6
There are hundreds of files already stored on HDFS with the schema shown above.
Then the profit calculation should work as follows, using FIFO matching for product 1: the Sell of 5 units at 120.0 on Jan 5 is offset against the Jan 1 Buy (10 units at 100.0), yielding 5 * (120.0 - 100.0) = 100.0. The Sell of 10 units at 125.0 on Jan 6 first consumes the remaining 5 units of the Jan 1 Buy, yielding 5 * (125.0 - 100.0) = 125.0, and then 5 units of the Jan 3 Buy (15 units at 150.0), yielding 5 * (125.0 - 150.0) = -125.0. Net profit = 100.0 + 125.0 - 125.0 = 100.0.
Below is the code snippet I tried, but it does not work; it fails with a NullPointerException. Any better suggestions?
import org.apache.spark.SparkContext._
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.apache.spark.rdd._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
case class Inventory(PRODUCT_ID: Int, TRANSACTION_TYPE: String, QUANTITY: Long, PRICE: Double, DATE: String)
object MatchingInventory {
  def main(args: Array[String]) = {
    val conf = new SparkConf().setAppName("XYZ")
    val sc = new SparkContext(conf)
    val sqlcontext = new SQLContext(sc)

    // Create a DataFrame of Inventory objects from data spread across any number of text files.
    import sqlcontext.implicits._
    val dfInvent = sc.textFile("Invent.txt")
      .map(_.split(","))
      .map(p => Inventory(p(0).trim.toInt, p(1).trim, p(2).trim.toLong, p(3).trim.toDouble, p(4).trim))
      .toDF().cache()
    dfInvent.show()

    val idDF = dfInvent.map { row => row.getInt(0) }.distinct
    //idDF.show()
    val netProfit = sc.accumulator(0.0)
    idDF.foreach { id =>
      val sellDF = dfInvent.filter((dfInvent("PRODUCT_ID").contains(id)) && (dfInvent("TRANSACTION_TYPE").contains("Sell")))
      val buyDF = dfInvent.filter((dfInvent("PRODUCT_ID").contains(id)) && (dfInvent("TRANSACTION_TYPE").contains("Buy")))
      var soldQ: Long = sellDF.map { row => row.getLong(2) }.reduce(_ + _)
      var sellPrice: Double = sellDF.map { row => row.getLong(2) * row.getDouble(3) }.reduce(_ + _) // reduce sends the result back to the driver
      var profit: Double = 0.0
      // profit for each bought item
      buyDF.foreach { row =>
        if ((soldQ > 0) && (soldQ < row.getLong(2))) { profit += sellPrice - (soldQ * row.getDouble(3)); soldQ = 0 }
        else if ((soldQ > 0) && (soldQ > row.getLong(2))) { profit += sellPrice - (row.getLong(2) * row.getDouble(3)); soldQ = soldQ - row.getLong(2) }
        else {}
      }
      netProfit += profit
    }
    println("Inventory net profit: " + netProfit)
  }
}
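For what it's worth, the NullPointerException almost certainly comes from using dfInvent (and, through it, the SQLContext) inside the idDF.foreach closure: that closure is serialized and run on the executors, where the driver-side DataFrame and context are null. Spark does not support nested DataFrame or RDD operations inside transformations and actions, so the per-product filtering has to be restructured, for example as the keyBy/groupByKey approach in the solution further down.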
I tried something like this. This is workable code; the only issue is that I am using collect at a later stage to sync between Buy and Sell, which will lead to memory issues for large data.
from pyspark.sql import SQLContext
from pyspark import SparkConf
from pyspark import SparkContext
import sys
from pyspark.sql.functions import *
if __name__ == "__main__":
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    df = sqlContext.read.format('com.databricks.spark.csv').options(header='false', inferschema='true').load('test.csv')
    df = df.withColumn("C1", ltrim(df.C1))
    df.registerTempTable("tempTable")
    df = sqlContext.sql("select * from tempTable order by C0")
    # Key "<product_id>-<transaction_type>", value "<quantity>,<price>"
    dt = df.map(lambda s: (str(s[0]) + '-' + s[1], str(s[2]) + ',' + str(s[3])))
    # Concatenate all lots for the same key ('-' separated); the FIFO step
    # below relies on these staying in date order
    dt = dt.reduceByKey(lambda a, b: a + '-' + b)
    ds = dt.collect()
    dicTran = {}
    for x in ds:
        key = (x[0].split('-'))[0]
        tratype = (x[0].split('-'))[1]
        val = {}
        if key in dicTran:
            val = dicTran[key]
        val[tratype] = x[1]
        dicTran[key] = val
    profit = 0
    for key, value in dicTran.iteritems():
        if 'Sell' in value:
            buy = value['Buy']
            sell = value['Sell']
            ls = sell.split('-')
            sellAmount = 0
            sellquant = 0
            for x in ls:
                y = x.split(',')
                sellAmount = sellAmount + float(y[0]) * float(y[1])
                sellquant = sellquant + float(y[0])
            lb = buy.split('-')
            for x in lb:
                y = x.split(',')
                if float(y[0]) >= sellquant:
                    # This lot covers all the remaining sold quantity
                    profit += sellAmount - sellquant * float(y[1])
                    break  # without this, every later lot would be counted again
                else:
                    # Consume this lot entirely and move on to the next one
                    sellAmount -= float(y[0]) * float(y[1])
                    sellquant -= float(y[0])
    print 'profit', profit
Here is the logic I thought of:
1) For all rows with the same id and transaction type, I concatenate the quantity and price via a separator; for example, the two Sell rows of product 1 become the key "1-Sell" with the value "5,120.0-10,125.0".
2) Then I collect and split them to calculate the profit.
I know this will crash on large data sets because collect is used, but I could not think of anything better. I will try out your solution as well.
So here is the solution I came up with:
import org.apache.spark.SparkContext._
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.apache.spark.rdd._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
import java.text.SimpleDateFormat
import java.sql.Date
import scala.math.Ordering
// Defining the schema
case class Inventory(PRODUCT_ID: Int, TRANSACTION_TYPE: String, QUANTITY: Long, PRICE: Double, pDate: java.sql.Date)
object MatchingInventory {
  def main(args: Array[String]) = {
    val conf = new SparkConf().setAppName("XYZ")
    val sc = new SparkContext(conf)
    val sqlcontext = new SQLContext(sc)
    import sqlcontext.implicits._

    val format = new SimpleDateFormat("MMM d")

    // Read data from a directory containing multiple files
    val dfInvent = sc.textFile("data/*.txt")
      .map(_.split(","))
      .map(p => Inventory(p(0).trim.toInt, p(1).trim, p(2).trim.toLong, p(3).trim.toDouble, new Date(format.parse(p(4)).getTime)))
      .cache()

    // FIFO profit for one product's transactions, already sorted by date
    def calculateProfit(data: Iterable[Inventory]): Double = {
      var soldQ: Long = 0
      var profit: Double = 0
      // First pass: total sold quantity and total sell revenue
      for (i <- data) {
        if (i.TRANSACTION_TYPE == "Sell") {
          soldQ = soldQ + i.QUANTITY
          profit = profit + i.PRICE * i.QUANTITY
        }
      }
      // Second pass: subtract the cost of the earliest Buy lots that cover the sold quantity
      for (i <- data) {
        if (i.TRANSACTION_TYPE == "Buy") {
          if ((soldQ > 0) && (soldQ <= i.QUANTITY)) { profit = profit - (soldQ * i.PRICE); soldQ = 0 }
          else if ((soldQ > 0) && (soldQ > i.QUANTITY)) { profit = profit - (i.QUANTITY * i.PRICE); soldQ = soldQ - i.QUANTITY }
        }
      }
      profit
    }

    // Group all transactions by product, then sort each group by date
    val key: RDD[(Int, Iterable[Inventory])] = dfInvent.keyBy(r => r.PRODUCT_ID).groupByKey
    val values: RDD[(Int, List[Inventory])] = key.mapValues(v => v.toList.sortBy(_.pDate.getTime))
    // Per-product profit, then the sum across all products
    val pro = values.map { case (k, v) => (k, calculateProfit(v)) }
    val netProfit = pro.map { case (k, v) => v }.reduce(_ + _)
    println("Inventory net profit: " + netProfit)
  }
}
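One design note: groupByKey ships all transactions for a single product to one executor, which is fine as long as one product's history fits in executor memory, and it avoids the driver-side collect of the Python attempt. For a quick sanity check, the same pipeline can be fed the example rows from the question in memory. This is a hedged sketch: the local[*] master and the in-line data are my assumptions, and it presumes the Inventory case class and calculateProfit from the code above are in scope. It should print 100.0, matching the worked example:

// Local smoke test: build the example rows in memory instead of reading from HDFS.
val testConf = new SparkConf().setAppName("MatchingInventoryTest").setMaster("local[*]")
val testSc = new SparkContext(testConf)
val fmt = new SimpleDateFormat("MMM d")
def d(s: String) = new Date(fmt.parse(s).getTime)
val rows = testSc.parallelize(Seq(
  Inventory(1, "Buy", 10, 100.0, d("Jan 1")),
  Inventory(2, "Buy", 20, 200.0, d("Jan 2")),
  Inventory(1, "Buy", 15, 150.0, d("Jan 3")),
  Inventory(1, "Sell", 5, 120.0, d("Jan 5")),
  Inventory(1, "Sell", 10, 125.0, d("Jan 6"))
))
// Same grouping and per-product FIFO calculation as above
val perProduct = rows.keyBy(_.PRODUCT_ID).groupByKey.mapValues(_.toList.sortBy(_.pDate.getTime))
val net = perProduct.map { case (_, v) => calculateProfit(v) }.reduce(_ + _)
println("Test net profit = " + net) // expected: 100.0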