Working with Python 3.7.9 and Spark 2.4.5, I am trying to manually "try parse" a given subset of columns from string to integer, and then add two extra columns to the dataframe:
I was able to calculate "_num_invalid_columns", but I am having issues with "_invalid_columns_list". Code to reproduce is below, reduced as much as possible.
'''
Uncomment these 2 lines if using Jupyter Notebook
import findspark
findspark.init()
'''
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql import functions as F
def tryparse_integer(integer_str):
    """Best-effort cast of a string Column to IntegerType.

    * NULL input        -> NULL (so missing data stays distinguishable
                           from bad data)
    * unparseable input -> the -9999 sentinel
    * parseable input   -> its integer value
    """
    as_int = integer_str.cast(IntegerType())
    null_result = F.lit(None).cast(IntegerType())
    sentinel = F.lit(-9999).cast(IntegerType())
    # A non-null string whose cast came back NULL failed to parse.
    parsed_or_sentinel = F.when(F.isnull(as_int), sentinel).otherwise(as_int)
    return F.when(F.isnull(integer_str), null_result).otherwise(parsed_or_sentinel)
def is_invalid_number(col):
    """Flag the -9999 parse-failure sentinel: 1 when present, 0 otherwise."""
    hit_sentinel = col == -9999
    return F.when(hit_sentinel, 1).otherwise(0)
# Spin up a local session and a tiny demo frame: three string columns, where
# column1/column2 may hold non-numeric garbage ('error').
spark = SparkSession.builder.appName("RandallTest").getOrCreate()
data = [('1', '2','hello'), ('error','error','hello'), ('error','2','hello')]
schema = StructType([
    StructField('column1', StringType()),
    StructField('column2', StringType()),
    StructField('column3', StringType())
])
df = spark.createDataFrame(data, schema = schema)
df.printSchema()
# Only these columns get the string -> integer "try parse" treatment.
integerColumns = ['column1','column2']
df_parsed = df.select(*[
    tryparse_integer(F.col(colName)).alias(colName) if (colName in integerColumns)
    else colName
    for colName in df.columns])
df_parsed.printSchema()
df_parsed_with_errorcount = df_parsed \
    .withColumn('_num_invalid_columns', sum(
        # Per row: add 1 for every tracked column holding the -9999 sentinel.
        is_invalid_number(F.col(colName)) if (colName in integerColumns)
        else 0
        for colName in df_parsed.columns)) \
    .withColumn('_invalid_columns_list', F.lit('--'.join(filter(None, (
        ## Not what I need, but works: joins the *candidate* column names,
        ## not the per-row failing ones.
        colName if (colName in integerColumns)
        ## Not working if I uncomment the actual logic I want. Any of the
        ## three commented variants below produces an error.
        ##colName if (colName in integerColumns and F.col(colName) == -9999)
        ##colName if (colName in integerColumns & F.col(colName) == -9999)
        ##colName if (colName in integerColumns & is_invalid_number(F.col(colName)) == 1)
        else None
        for colName in df_parsed.columns)))))
df_parsed_with_errorcount.show()
df_parsed_with_errorcount.take(10)
Example Input:
column1 column2 column3
'1' '2' 'hello'
'error' 'error' 'hello'
'error' '2' 'hello'
Columns to "try parse": column1, column2
Expected output:
column1 column2 column3 _num_invalid_columns _invalid_columns_list
1 2 'hello' 0
-9999 -9999 'hello' 2 column1,column2
-9999 2 'hello' 1 column1
Use F.lit(colName) to put your column names into the dataframe as literal values:
# Build one conditional column per input column: the column's own name when it
# holds the -9999 sentinel, otherwise NULL — and concat_ws silently skips NULLs.
invalid_markers = [
    F.when(is_invalid_number(F.col(colName)) == 1, F.lit(colName))
    for colName in df_parsed.columns
]
df_parsed_with_errorcount = df_parsed.withColumn(
    '_invalid_columns_list',
    F.concat_ws(',', *invalid_markers)
)
df_parsed_with_errorcount.show()
+-------+-------+-------+---------------------+
|column1|column2|column3|_invalid_columns_list|
+-------+-------+-------+---------------------+
| 1| 2| hello| |
| -9999| -9999| hello| column1,column2|
| -9999| 2| hello| column1|
+-------+-------+-------+---------------------+
Expanding upon input from mck, my final code looks like this:
First a modification to the helper function to make it "type safe", so to speak
def is_invalid_number(col):
    """Flag the -9999 parse-failure sentinel: 1 when present, 0 otherwise.

    The comparison is done on the string representation rather than the
    numeric one (``col == -9999``), which proved unreliable here — see the
    discussion below about when() evaluation order.
    """
    hit_sentinel = col.cast(StringType()) == '-9999'
    return F.when(hit_sentinel, 1).otherwise(0)
And then the actual column calculation using concat_ws() + array_contains() + my helper function
# _num_invalid_columns: per row, count how many tracked columns hold the
# -9999 parse-failure sentinel.
# _invalid_columns_list: per row, the comma-separated names of exactly those
# columns. Column names enter the frame via F.lit(colName); entries that are
# NULL (untracked columns, or tracked columns without the sentinel) are
# silently dropped by concat_ws.
df_parsed_with_errorcount = df_parsed \
    .withColumn('_num_invalid_columns', sum(
        is_invalid_number(F.col(colName)) if (colName in integerColumns)
        else 0
        for colName in df_parsed.columns)) \
    .withColumn('_invalid_columns_list', F.concat_ws(',', *[ \
        (F.when(F.array_contains(F.array([F.lit(x) for x in integerColumns]), colName), F.when(is_invalid_number(F.col(colName)) == 1, F.lit(colName))) \
        .otherwise(F.lit(None))
        ) for colName in df_parsed.columns]))
From what I can tell from the error messages seen before making the is_invalid_number helper function type safe, Spark does not guarantee the order of evaluation of the conditions within when() — even after nesting one when() inside another, as opposed to using a single when() with two conditions joined by & (and).
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.