I would like to aggregate a column of JSON values in a Spark dataframe and a Hive table.
eg
year, month, val (json)
2010 01 [{"a_id":"caes"},{"a_id":"rgvtsa"},{"a_id":"btbsdv"}]
2010 01 [{"a_id":"caes"},{"a_id":"uktf"},{"a_id":"ohcwa"}]
2008 10 [{"a_id":"rfve"},{"a_id":"yjndf"},{"a_id":"onbds"}]
2008 10 [{"a_id":"fvds"},{"a_id":"yjndf"},{"a_id":"yesva"}]
I need:
year, month, val (json), num (int)
2010 01 [{"a_id":"caes"},{"a_id":"rgvtsa"},{"a_id":"btbsdv"},{"a_id":"uktf"},{"a_id":"ohcwa"}] 5
2008 10 [{"a_id":"rfve"},{"a_id":"yjndf"},{"a_id":"onbds"},{"a_id":"fvds"},{"a_id":"yesva"}] 5
I need to remove the duplicates and also find the size of the json string (num of "a_id") in it.
The data is saved as a Hive table, so would it be better to work on it with PySpark SQL?
I also would like to know how to work on it if it is saved as a spark dataframe.
I have tried:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import ArrayType, StructType, StructField, StringType

# "val" holds a JSON *array* of objects, e.g. [{"a_id":"caes"}, ...].
# A bare StructType only matches a single JSON object, so from_json
# returns null for every row — wrap the struct in ArrayType instead.
schema = ArrayType(
    StructType(
        [
            StructField('a_id', StringType(), True)
        ]
    )
)

# Parse the JSON string into array<struct<a_id:string>>.
# Note: 'val.*' is only valid for a struct column, not an array,
# so select the parsed column as a whole.
df.withColumn("val", from_json("val", schema))\
    .select(col('year'), col('month'), col('val'))\
    .show()
But, all values in "val" are null.
thanks
UPDATE my hive version:
%sh
ls /databricks/hive | grep "hive"
spark--maven-trees--spark_1.4_hive_0.13
My DDL:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.types import *
from functools import reduce  # reduce is not a builtin in Python 3

def concate_elements(val):
    """Concatenate a list of lists into a single flat list."""
    return reduce(lambda x, y: x + y, val)

# UDF that flattens an array-of-arrays column into one array of strings.
flatten_array = F.udf(concate_elements, T.ArrayType(T.StringType()))

# UDF that removes duplicate entries from an array column.
# NOTE: `udf` is referenced via F — a bare `udf` name is undefined here.
remove_duplicates = F.udf(lambda row: list(set(row)),
                          T.ArrayType(T.StringType()))

# final results: flatten, deduplicate, then count the unique values
df.select("year", "month", flatten_array("val").alias("flattenvalues")) \
  .withColumn("uniquevalues", remove_duplicates("flattenvalues")) \
  .withColumn("size", F.size("uniquevalues")) \
  .show()
Considered input data: JSON file json-input.json
{"year":"2010","month":"01","value":[{"a_id":"caes"},{"a_id":"uktf"},{"a_id":"ohcwa"}]}
{"year":"2011","month":"01","value":[{"a_id":"caes"},{"a_id":"uktf"},{"a_id":"uktf"},{"a_id":"sathya"}]}
Approach 1. Read data from hive
1. insert data into hive
-- Register the JSON SerDe so Hive can parse newline-delimited JSON rows.
ADD JAR /home/sathya/Downloads/json-serde-1.3.7-jar-with-dependencies.jar
-- External table: "value" is an array of structs, matching the JSON
-- shape [{"a_id": "..."}, ...] present on each input line.
CREATE EXTERNAL TABLE json_table (
year string,
month string,
value array<struct<a_id:string>>)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe';
-- Load the local JSON file into the table, then sanity-check the contents.
load data local inpath '/home/sathya/json-input.json' into table json_table;
select * from json_table;
OK
2010 01 [{"a_id":"caes"},{"a_id":"uktf"},{"a_id":"ohcwa"}]
2011 01 [{"a_id":"caes"},{"a_id":"uktf"},{"a_id":"uktf"},{"a_id":"sathya"}]
2. Read data from spark:
pyspark --jars /home/sathya/Downloads/json-serde-1.3.7-jar-with-dependencies.jar --driver-class-path /home/sathya/Downloads/json-serde-1.3.7-jar-with-dependencies.jar
# Read the Hive table into a DataFrame via Spark SQL.
df = spark.sql("select * from default.json_table")
df.show(truncate=False)
'''
+----+-----+----------------------------------+
|year|month|value                             |
+----+-----+----------------------------------+
|2010|01   |[[caes], [uktf], [ohcwa]]         |
|2011|01   |[[caes], [uktf], [uktf], [sathya]]|
+----+-----+----------------------------------+
'''
from functools import reduce  # reduce is not a builtin in Python 3

#UDFs for concatenating the array elements & removing duplicates in an array
def concate_elements(val):
    """Concatenate a list of lists into a single flat list."""
    return reduce(lambda x, y: x + y, val)

flatten_array = F.udf(concate_elements, T.ArrayType(T.StringType()))
remove_duplicates = F.udf(lambda row: list(set(row)), T.ArrayType(T.StringType()))

# final results — fixed: use flatten_array (the UDF defined above) instead
# of the undefined name `flattenUdf`, and F.size instead of a bare `size`.
df.select("year", "month", flatten_array("value").alias("flattenvalues")) \
  .withColumn("uniquevalues", remove_duplicates("flattenvalues")) \
  .withColumn("size", F.size("uniquevalues")) \
  .show()
'''
+----+-----+--------------------------+--------------------+----+
|year|month|flattenvalues |uniquevalues |size|
+----+-----+--------------------------+--------------------+----+
|2010|01 |[caes, uktf, ohcwa] |[caes, uktf, ohcwa] |3 |
|2011|01 |[caes, uktf, uktf, sathya]|[caes, sathya, uktf]|3 |
+----+-----+--------------------------+--------------------+----+
'''
Approach 2 - direct read from input Json file json-input.json
{"year":"2010","month":"01","value":[{"a_id":"caes"},{"a_id":"uktf"},{"a_id":"ohcwa"}]}
{"year":"2011","month":"01","value":[{"a_id":"caes"},{"a_id":"uktf"},{"a_id":"uktf"},{"a_id":"sathya"}]}
code for your scenario is:
import os
import logging
from functools import reduce  # reduce is not a builtin in Python 3
from pyspark.sql import SQLContext, SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql import functions as F
import pyspark.sql.types as T

# Read the newline-delimited JSON directly; Spark infers the schema,
# including value as array<struct<a_id:string>>.
df = spark.read.json("file:///home/sathya/json-input.json")
df.show(truncate=False)
'''
+-----+----------------------------------+----+
|month|value                             |year|
+-----+----------------------------------+----+
|01   |[[caes], [uktf], [ohcwa]]         |2010|
|01   |[[caes], [uktf], [uktf], [sathya]]|2011|
+-----+----------------------------------+----+
'''
#UDFs for concatenating the array elements & removing duplicates in an array
def concate_elements(val):
    """Concatenate a list of lists into a single flat list."""
    return reduce(lambda x, y: x + y, val)

flatten_array = F.udf(concate_elements, T.ArrayType(T.StringType()))
remove_duplicates = F.udf(lambda row: list(set(row)), T.ArrayType(T.StringType()))

# final results — fixed: use flatten_array (the UDF defined above) instead
# of the undefined name `flattenUdf`, and F.size instead of a bare `size`.
df.select("year", "month", flatten_array("value").alias("flattenvalues")) \
  .withColumn("uniquevalues", remove_duplicates("flattenvalues")) \
  .withColumn("size", F.size("uniquevalues")) \
  .show()
'''
+----+-----+--------------------------+--------------------+----+
|year|month|flattenvalues |uniquevalues |size|
+----+-----+--------------------------+--------------------+----+
|2010|01 |[caes, uktf, ohcwa] |[caes, uktf, ohcwa] |3 |
|2011|01 |[caes, uktf, uktf, sathya]|[caes, sathya, uktf]|3 |
+----+-----+--------------------------+--------------------+----+
'''
Here is a solution that'll work in Databricks:
#Import libraries
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Define schema: "val" is an array of structs with a single string field,
# matching the JSON shape [{"a_id": "..."}, ...].
schema1 = StructType([
    StructField('year', IntegerType(), True),
    StructField('month', StringType(), True),
    StructField('val', ArrayType(StructType([
        StructField('a_id', StringType(), True)
    ])))
])

# Test data mirroring the question's four rows.
rowsArr = [
    [2010, '01', [{"a_id": "caes"}, {"a_id": "rgvtsa"}, {"a_id": "btbsdv"}]],
    [2010, '01', [{"a_id": "caes"}, {"a_id": "uktf"}, {"a_id": "ohcwa"}]],
    [2008, '10', [{"a_id": "rfve"}, {"a_id": "yjndf"}, {"a_id": "onbds"}]],
    [2008, '10', [{"a_id": "fvds"}, {"a_id": "yjndf"}, {"a_id": "yesva"}]]
]

#Create dataframe
df1 = spark.createDataFrame(rowsArr, schema=schema1)

#Create database
spark.sql('CREATE DATABASE IF NOT EXISTS testdb')

# Dump it into a Hive table. NOTE: the schema already travels with df1;
# .options(schema=...) is not a valid DataFrameWriter option (writer
# options take string values) and has been removed.
(df1
    .write
    .mode('overwrite')
    .saveAsTable('testdb.testtable')
)

#read from hive table
df_ht = spark.sql('select * from testdb.testtable')

# Perform transformation: gather all rows per (year, month), flatten the
# collected array-of-arrays, drop duplicate structs, and count them.
df2 = (df_ht
    .groupBy('year', 'month')
    .agg(array_distinct(flatten(collect_list('val'))).alias('val'))
    .withColumn('num', size('val'))
)
Input DF:
Output DF:
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.