[英]Find average by department in spark groupBy in Java 1.8
我有一個下面的數據集,其中第一列是部門,第二列是薪水。 我想按部門計算平均工資。
IT 2000000
HR 2000000
IT 1950000
HR 2200000
Admin 1900000
IT 1900000
IT 2200000
我執行了以下操作
// Group the values of the pair RDD by key, giving one Iterable<Long> per key.
// Presumably `employees` holds (department, salary) pairs as described in the question — confirm against its definition.
JavaPairRDD<String, Iterable<Long>> rddY = employees.groupByKey();
// collect() returns the grouped pairs as a list, which is concatenated onto the banner and printed.
System.out.println("<=========================RDDY collect==================>" + rddY.collect());
並得到以下輸出:
<=========================RDDY
collect==================>[(IT,[2000000, 1950000, 1900000, 2200000]),
(HR,[2000000, 2200000]), (Admin,[1900000])]
我需要的是
我想使用 Spark RDD 計算總體平均值以及各部門的平均值。
如何在 Spark 中使用 groupBy 函數計算平均值?
以下是使用Spark JavaPairRDD按鍵計算平均值的代碼。 希望這可以幫助。
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
 * Computes the average value per key with Spark's pair-RDD API.
 *
 * <p>Every value is first mapped to a (sum, count) pair, the pairs of each key are merged with
 * {@code reduceByKey}, and the merged sum is finally divided by the merged count.
 */
public class SparkAverageCalculation {

    /**
     * Converts a (key, (sum, count)) record into a (key, average) record.
     *
     * <p>The sum is widened to {@code double} before dividing so that the fractional part of the
     * average is kept instead of being dropped by integer division.
     */
    private static final PairFunction<Tuple2<String, Tuple2<Long, Long>>, String, Double> getAverageByKey = (tuple) -> {
        Tuple2<Long, Long> sumAndCount = tuple._2;
        long total = sumAndCount._1;
        long count = sumAndCount._2;
        return new Tuple2<String, Double>(tuple._1, (double) total / count);
    };

    /**
     * Builds a small in-memory sample, computes the average per key and prints one
     * {@code Key=<key> Average=<average>} line per key.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Average Calculation").setMaster("local[2]");

        // try-with-resources closes the context exactly once, even when a job throws.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Sample input: (key, value) pairs.
            List<Tuple2<String, Integer>> inputList = new ArrayList<>();
            inputList.add(new Tuple2<>("a1", 30));
            inputList.add(new Tuple2<>("b1", 30));
            inputList.add(new Tuple2<>("a1", 40));
            inputList.add(new Tuple2<>("a1", 20));
            inputList.add(new Tuple2<>("b1", 50));

            JavaPairRDD<String, Integer> pairRDD = sc.parallelizePairs(inputList);

            // Attach a count of 1 to every value; both parts are widened to long so that
            // summing many large values cannot silently overflow an int.
            JavaPairRDD<String, Tuple2<Long, Long>> valueCount =
                    pairRDD.mapValues(value -> new Tuple2<Long, Long>(value.longValue(), 1L));

            // Merge the pairs of each key into (sum of values, number of values).
            JavaPairRDD<String, Tuple2<Long, Long>> reducedCount =
                    valueCount.reduceByKey((left, right) ->
                            new Tuple2<Long, Long>(left._1 + right._1, left._2 + right._2));

            // sum / count for every key.
            JavaPairRDD<String, Double> averagePair = reducedCount.mapToPair(getAverageByKey);

            averagePair.foreach(data -> {
                System.out.println("Key=" + data._1() + " Average=" + data._2());
            });
        }
    }
}
import org.apache.htrace.fasterxml.jackson.databind.ObjectMapper;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
import scala.Tuple2;
import java.util.Map;
/**
 * Reads documents from the Elasticsearch resource {@code portal_analytics/report-execution} and
 * prints the average {@code duration} per {@code id}.
 */
public class ElasticsearchMetricProcessor {

    private static final String ES_HOST_PORT = "localhost:9200";

    /**
     * Converts a (key, (sum, count)) record into a (key, average) record.
     *
     * <p>The division is done on {@code long} values, so the average is truncated toward zero.
     */
    private static final PairFunction<Tuple2<String, Tuple2<Long, Integer>>, String, Long> getAverageByKey = (tuple) -> {
        Tuple2<Long, Integer> sumAndCount = tuple._2;
        long total = sumAndCount._1;
        int count = sumAndCount._2;
        return new Tuple2<String, Long>(tuple._1, total / count);
    };

    /**
     * Loads the documents, sums {@code duration} and counts documents per {@code id}, then prints
     * one {@code Key=<id> Average=<average>} line per id.
     *
     * @param args command-line arguments (unused)
     * @throws InterruptedException kept in the signature for compatibility with existing callers
     */
    public static void main(String[] args) throws InterruptedException {
        // Machine-specific location of the Hadoop winutils directory.
        System.setProperty("hadoop.home.dir", "C:\\Users\\anki\\metering\\winutils");

        SparkConf sparkConf = new SparkConf().setAppName("StreamingApp").setMaster("local[2]");
        sparkConf.set("es.nodes.wan.only", "false");
        sparkConf.set("es.nodes", ES_HOST_PORT);

        // try-with-resources closes the context exactly once, even when a job throws.
        // NOTE(review): only jsc.sparkContext() is used and no stream is ever started, so a plain
        // Spark context would probably be sufficient here.
        try (JavaStreamingContext jsc = new JavaStreamingContext(sparkConf, Durations.seconds(10))) {
            // Each element is one document represented as a field-name -> value map.
            JavaRDD<Map<String, Object>> esRDD =
                    JavaEsSpark.esRDD(jsc.sparkContext(), "portal_analytics/report-execution").values();

            // Key every document by its "id" field and keep its "duration" as a long.
            JavaPairRDD<String, Long> durationById = esRDD.mapToPair(
                    x -> new Tuple2<String, Long>(x.get("id").toString(), Long.valueOf(x.get("duration").toString())));

            // Attach a count of 1 to every duration.
            JavaPairRDD<String, Tuple2<Long, Integer>> valueCount =
                    durationById.mapValues(value -> new Tuple2<Long, Integer>(value, 1));

            // Merge the pairs of each id into (sum of durations, number of documents).
            JavaPairRDD<String, Tuple2<Long, Integer>> reducedCount =
                    valueCount.reduceByKey((left, right) ->
                            new Tuple2<Long, Integer>(left._1 + right._1, left._2 + right._2));

            // sum / count for every id.
            JavaPairRDD<String, Long> averagePair = reducedCount.mapToPair(getAverageByKey);

            averagePair.foreach(data -> {
                System.out.println("Key=" + data._1() + " Average=" + data._2());
            });
        }
    }
}
--------------------------------------------------------
Elasticsearch Test Data
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"failed": 0
},
"hits": {
"total": 16,
"max_score": 1,
"hits": [
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8aPGm2uMcgoWFwdx",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8eOcm2uMcgoWFwd3",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvTL5ACm2uMcgoWFweC",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "b37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 70
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvTL96Xm2uMcgoWFweD",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "b37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvTNrKPm2uMcgoWFweF",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "b37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8dWFm2uMcgoWFwdy",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8dlim2uMcgoWFwdz",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8d7am2uMcgoWFwd1",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8eX0m2uMcgoWFwd4",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 30
}
},
{
"_index": "portal_analytics",
"_type": "report-execution",
"_id": "AVvS8nplm2uMcgoWFwd7",
"_score": 1,
"_source": {
"type": "report-execution",
"id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
"date-time": 1475733719123,
"tenant": "default",
"user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
"report": "72efd670-bb95-11e5-632f-54ee7539b24c",
"duration": 50
}
}
]
}
}
輸出:
Key=b37cacc3-71d5-40f0-a329-a051a3949ced Average=50
Key=a37cacc3-71d5-40f0-a329-a051a3949ced Average=37
上面的解決方案是用Java編寫的,對於任何正在使用scala的人,都可以嘗試下面的解決方案。
// Split every input line on a single space; field 0 is used as the key and field 1 as the value
// (presumably "<department> <salary>" as in the question's data set).
val fields = data.map(line => line.split(" "))
// Pair each key with (value, 1) so that sums and counts can be reduced together.
val valueWithOne = fields.map(cols => (cols(0), (cols(1).toInt, 1)))
// Per key: (sum of values, number of values).
val sumAndCount = valueWithOne.reduceByKey { (left, right) => (left._1 + right._1, left._2 + right._2) }
// Int division: the average is truncated toward zero.
val averages = sumAndCount.mapValues { case (total, n) => total / n }
averages.foreach(println)
輸出
(Admin,1900000)
(HR,2100000)
(IT,2012500)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.