[英]Parse a JSON column in a spark dataframe using Spark
輸入:
案例ID | object_value |
---|---|
1 | [{'dummyAcc':'12346','accountRequest':{'schemeCode':'ZEROQ1', 'CCZ':'SGD'}}] |
2 | [{'dummyAcc':'12347','accountRequest':{'schemeCode':'ZEROQ2', 'CCZ':'SGD'}}] |
3 | [{'dummyAcc':'12348','accountRequest':{'schemeCode':'ZEROQ5', 'CCZ':'SGD'}}] |
4 | [{'dummyAcc':'12349','accountRequest':{'schemeCode':'ZEROQ', 'CCZ':'SGD'}}] |
5 | [{'dummyAcc':'12350','accountRequest':{'schemeCode':'ZEROQ', 'CCZ':'SGD'}}] |
輸出:
案例ID | schemeCode | CCZ |
---|---|---|
1 | ZEROQ1 | SGD |
2 | ZEROQ2 | SGD |
3 | ZEROQ5 | SGD |
4 | ZEROQ | SGD |
5 | ZEROQ | SGD |
請指導我在 spark 中實現此輸出,我能夠使用小樣本數據在 python 中執行此操作,但由於生產中的數據量需要在 spark 中執行此操作。 提前致謝
要提取類似 JSON 的數據,請使用函數 from_json。它需要一個模式(schema)作為輸入。並且您的 JSON 使用單引號而非標準的雙引號,因此,您需要添加選項 {"allowSingleQuotes": "true"}。
from pyspark.sql import functions as F, types as T

# Schema of one JSON object: a top-level account id plus a nested
# accountRequest struct holding the two fields we want to surface.
account_request_schema = T.StructType(
    [
        T.StructField("schemeCode", T.StringType()),
        T.StructField("CCZ", T.StringType()),
    ]
)
schm = T.StructType(
    [
        T.StructField("dummyAcc", T.StringType()),
        T.StructField("accountRequest", account_request_schema),
    ]
)

# Parse the JSON string column in place (single quotes allowed), then
# flatten the nested struct fields with dotted paths.
parsed = df.withColumn(
    "object_value",
    F.from_json("object_value", schm, options={"allowSingleQuotes": "true"}),
)
parsed.select(
    "caseid",
    "object_value.accountRequest.schemeCode",
    "object_value.accountRequest.CCZ",
).show()
+------+----------+---+
|caseid|schemeCode|CCZ|
+------+----------+---+
| 1| ZEROQ1|SGD|
| 2| ZEROQ2|SGD|
| 3| ZEROQ5|SGD|
| 4| ZEROQ|SGD|
| 5| ZEROQ|SGD|
+------+----------+---+
您可以使用get_json_object
,它很簡單
import pyspark.sql.functions as f

# Sample input: object_value holds a JSON array wrapping a single record.
rows = [
    (1, """[{'dummyAcc':'12346','accountRequest':{'schemeCode':'ZEROQ1', 'CCZ':'SGD'}}]"""),
    (2, """[{'dummyAcc':'12347','accountRequest':{'schemeCode':'ZEROQ2', 'CCZ':'SGD'}}]"""),
    (3, """[{'dummyAcc':'12348','accountRequest':{'schemeCode':'ZEROQ5', 'CCZ':'SGD'}}]"""),
    (4, """[{'dummyAcc':'12349','accountRequest':{'schemeCode':'ZEROQ', 'CCZ':'SGD'}}]"""),
    (5, """[{'dummyAcc':'12350','accountRequest':{'schemeCode':'ZEROQ', 'CCZ':'SGD'}}]"""),
]
df = spark.createDataFrame(rows, schema='caseid int, object_value string')

# Extract the nested fields with JSONPath; [*] walks every array element.
final_df = df.select(
    'caseid',
    f.get_json_object('object_value', '$[*].accountRequest.schemeCode').alias('schemeCode'),
    f.get_json_object('object_value', '$[*].accountRequest.CCZ').alias('CCZ'),
)
final_df.show(truncate=False)
# +------+----------+-----+
# |caseid|schemeCode|CCZ |
# +------+----------+-----+
# |1 |"ZEROQ1" |"SGD"|
# |2 |"ZEROQ2" |"SGD"|
# |3 |"ZEROQ5" |"SGD"|
# |4 |"ZEROQ" |"SGD"|
# |5 |"ZEROQ" |"SGD"|
# +------+----------+-----+
所以一位同事曾經告訴我 regexp_extract
比解析 JSON 更快,我一直相信……直到今天我決定運行一些計時實驗,將它與這裡發布的使用 get_json_object
和 from_json
的其他兩個解決方案進行比較。
簡短的回答是,即使我們通過添加數千個額外的 K:V 對來使 JSON 復雜化,所有方法的耗時都相當。 regexp_extract
方法在這些測試中實際上一直略慢一些。
設置:證明每種方法都有效
import pandas as pd  # fix: pd.DataFrame is used below but pandas was never imported
import pyspark.sql.functions as fun
import pyspark.sql.types as t

# Five sample rows; object_value is a plain JSON object (double quotes,
# so no allowSingleQuotes option is strictly needed here).
case_ids = range(1, 6)
data = [
    '{"dummyAcc":"12346","accountRequest":{"schemeCode":"ZEROQ1", "CCZ":"SGD"}}',
    '{"dummyAcc":"12347","accountRequest":{"schemeCode":"ZEROQ2", "CCZ":"SGD"}}',
    '{"dummyAcc":"12348","accountRequest":{"schemeCode":"ZEROQ5", "CCZ":"SGD"}}',
    '{"dummyAcc":"12349","accountRequest":{"schemeCode":"ZEROQ", "CCZ":"SGD"}}',
    '{"dummyAcc":"12350","accountRequest":{"schemeCode":"ZEROQ", "CCZ":"SGD"}}'
]
# Build the Spark DataFrame from a pandas frame of (caseid, object_value).
df = spark.createDataFrame(pd.DataFrame({"caseid": case_ids, "object_value": data}))
##
# fun.from_json
##
# Schema mirroring the JSON payload: dummyAcc string plus a nested
# accountRequest struct with the two fields the queries project.
_account_request = (
    t.StructType()
    .add("schemeCode", t.StringType())
    .add("CCZ", t.StringType())
)
schm = (
    t.StructType()
    .add("dummyAcc", t.StringType())
    .add("accountRequest", _account_request)
)
def run_from_json(df):
    """Parse object_value with the explicit schema, then project the nested fields."""
    parsed = df.withColumn(
        "object_value",
        fun.from_json("object_value", schm, options={"allowSingleQuotes": "true"}),
    )
    return parsed.select(
        "caseid",
        "object_value.accountRequest.schemeCode",
        "object_value.accountRequest.CCZ",
    )
##
# get_json
##
def run_get_json(df):
    """Pull the two nested fields out with JSONPath expressions (no schema needed)."""
    scheme_col = fun.get_json_object('object_value', '$.accountRequest.schemeCode').alias('schemeCode')
    ccz_col = fun.get_json_object('object_value', '$.accountRequest.CCZ').alias('CCZ')
    return df.select('caseid', scheme_col, ccz_col)
##
# regexp_extract
##
def run_regexp_extract(df):
    """Scrape schemeCode/CCZ straight out of the raw JSON text with regexes.

    Group 3 of each pattern is the captured value.  The patterns are raw
    strings: the original used plain strings containing ``\\w``, which is an
    invalid escape sequence (a SyntaxWarning on Python 3.12+); the runtime
    pattern bytes are unchanged.
    """
    return (
        df.withColumn(
            "schemeCode",
            fun.regexp_extract(fun.col("object_value"), r'(.)("schemeCode":")(\w+)', 3),
        )
        .withColumn(
            "CCZ",
            fun.regexp_extract(fun.col("object_value"), r'(.)("CCZ":")(\w+)', 3),
        )
        .select("caseid", "schemeCode", "CCZ")
    )
##
# Test them out
##
# Same output, same order as calling the three methods one by one.
for label, runner in [
    ("from_json", run_from_json),
    ("get_json", run_get_json),
    ("regexp_extract", run_regexp_extract),
]:
    print(label)
    runner(df).show(truncate=False)
from_json
+------+----------+---+
|caseid|schemeCode|CCZ|
+------+----------+---+
|1 |ZEROQ1 |SGD|
|2 |ZEROQ2 |SGD|
|3 |ZEROQ5 |SGD|
|4 |ZEROQ |SGD|
|5 |ZEROQ |SGD|
+------+----------+---+
get_json
+------+----------+---+
|caseid|schemeCode|CCZ|
+------+----------+---+
|1 |ZEROQ1 |SGD|
|2 |ZEROQ2 |SGD|
|3 |ZEROQ5 |SGD|
|4 |ZEROQ |SGD|
|5 |ZEROQ |SGD|
+------+----------+---+
regexp_extract
+------+----------+---+
|caseid|schemeCode|CCZ|
+------+----------+---+
|1 |ZEROQ1 |SGD|
|2 |ZEROQ2 |SGD|
|3 |ZEROQ5 |SGD|
|4 |ZEROQ |SGD|
|5 |ZEROQ |SGD|
+------+----------+---+
計時第 1 部分——使用短 JSON我使用上面定義的默認緊湊 JSON 檢查了運行多次迭代的掛鍾時間。
def time_run_method(df, n_it, meth, meth_name):
    """Time n_it evaluations of meth(df).count() and print the total wall-clock.

    Args:
        df: DataFrame handed to ``meth`` on every iteration.
        n_it: number of iterations to run.
        meth: callable taking ``df`` and returning an object with ``.count()``.
        meth_name: label printed before the timing line.
    """
    import time  # local import so the snippet is self-contained (was never imported)

    t0 = time.time()
    for _ in range(n_it):
        meth(df).count()  # count() forces full evaluation of the lazy plan
    td = time.time() - t0
    # Bug fix: the original printed the *global* loop variable ``n`` (leaked
    # from the caller's zip loop) and never used the meth_name parameter.
    print(meth_name)
    print("Time to count %d iterations: %s [sec]" % (n_it, "{:,}".format(td)))
# Time 200 iterations of each extraction strategy on the compact JSON.
for runner, label in (
    (run_from_json, "from_json"),
    (run_get_json, "get_json"),
    (run_regexp_extract, "regexp_extract"),
):
    time_run_method(df, 200, runner, label)
from_json
Time to count 200 iterations: 15.918861389160156 [sec]
get_json
Time to count 200 iterations: 15.668830871582031 [sec]
regexp_extract
Time to count 200 iterations: 17.539576292037964 [sec]
計時第 2 部分——使用長 JSON我向JSON添加了 2000 個鍵值對,以查看反序列化它們的額外開銷是否會改變事情。 它沒有。 也許這個結構太簡單了,內部解析器能夠簡單地避免額外的鍵,或者考慮到結構的扁平度,它們不會帶來很多開銷。 我不知道。
import json  # fix: json.dumps is used below but json was never imported
import pandas as pd  # fix: pd.DataFrame is used below but pandas was never imported

# ~2000 extra key/value pairs to fatten each JSON document; json.dumps
# stringifies the int keys produced by enumerate.
cruft = json.dumps({k: v for k, v in enumerate(range(2000))})
data = [
    '{ "cruft": %s, "dummyAcc":"12346","accountRequest":{"schemeCode":"ZEROQ1", "CCZ":"SGD"}}' % cruft,
    '{ "cruft": %s, "dummyAcc":"12347","accountRequest":{"schemeCode":"ZEROQ2", "CCZ":"SGD"}}' % cruft,
    '{ "cruft": %s, "dummyAcc":"12348","accountRequest":{"schemeCode":"ZEROQ5", "CCZ":"SGD"}}' % cruft,
    '{ "cruft": %s, "dummyAcc":"12349","accountRequest":{"schemeCode":"ZEROQ", "CCZ":"SGD"}}' % cruft,
    '{ "cruft": %s, "dummyAcc":"12350","accountRequest":{"schemeCode":"ZEROQ", "CCZ":"SGD"}}' % cruft
]
df2 = spark.createDataFrame(pd.DataFrame({"caseid": case_ids, "object_value": data}))
# Re-run the same 200-iteration timing on the bloated documents.
for m, n in zip([run_from_json, run_get_json, run_regexp_extract], ["from_json", "get_json", "regexp_extract"]):
    time_run_method(df2, 200, m, n)
from_json
Time to count 200 iterations: 16.005220413208008 [sec]
get_json
Time to count 200 iterations: 15.788024187088013 [sec]
regexp_extract
Time to count 200 iterations: 16.81353187561035 [sec]
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.