PythonException with UDF in Spark Dataframe
Hi everyone!
I wrote two functions to clean the text in my dataset. The dataset has the following schema:
|-- content: string (nullable = true)
|-- score: string (nullable = true)
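(That schema is simply what printSchema() shows for the DataFrame, which I call new_df in the code below; a minimal sketch of the call:)
new_df.printSchema()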
There are about 1,300,000 rows. Here is the function that removes punctuation. It works and does not cause any problems:
import string
def remove_punct(text):
    return text.translate(str.maketrans('', '', string.punctuation))

from pyspark.sql.functions import udf
punct_remove = udf(lambda s: remove_punct(s))
df_without_punct = new_df.withColumn("content", punct_remove(new_df["content"]))
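For reference, this is roughly how I sanity-check that step (just a sketch; the show() arguments are arbitrary):
# quick look at the punctuation-free column; this step completes without errors
df_without_punct.select("content").show(5, truncate=False)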
This is the one for removing emoji:
import re
def removeEmoji(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'', text)
removeEmoji = udf(lambda s: removeEmoji(s))
df_without_emoji = df_without_punct.withColumn("content", removeEmoji(df_without_punct["content"]))
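For what it's worth, the same regex works on a plain Python string outside Spark, so the pattern itself seems fine (a minimal standalone sketch; the function name and sample text are only for illustration):
import re

def remove_emoji_plain(text):
    # same character ranges as in removeEmoji above
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

print(remove_emoji_plain(u"hello \U0001F600 world"))  # -> "hello  world"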
When I show the DataFrame, the following error is returned:
---------------------------------------------------------------------------
PythonException Traceback (most recent call last)
<ipython-input-50-b7725dba3861> in <module>()
----> 1 df_without_emoji.show(50, False)
/usr/local/lib/python3.7/dist-packages/pyspark/sql/dataframe.py in show(self, n, truncate, vertical)
484 print(self._jdf.showString(n, 20, vertical))
485 else:
--> 486 print(self._jdf.showString(n, int(truncate), vertical))
487
488 def __repr__(self):
/usr/local/lib/python3.7/dist-packages/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
/usr/local/lib/python3.7/dist-packages/pyspark/sql/utils.py in deco(*a, **kw)
115 # Hide where the exception came from that shows a non-Pythonic
116 # JVM exception message.
--> 117 raise converted from None
118 else:
119 raise
PythonException:
An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
process()
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
serializer.dump_stream(out_iter, outfile)
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
for obj in iterator:
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 200, in _batched
for item in iterator:
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
return lambda *a: f(*a)
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
return f(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 77, in <lambda>
return lambda *a: g(f(*a))
File "<ipython-input-48-097359320a3e>", line 1, in <lambda>
File "/usr/local/lib/python3.7/dist-packages/pyspark/sql/udf.py", line 199, in wrapper
return self(*args)
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/udf.py", line 177, in __call__
judf = self._judf
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/udf.py", line 161, in _judf
self._judf_placeholder = self._create_judf()
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/udf.py", line 167, in _create_judf
spark = SparkSession.builder.getOrCreate()
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/session.py", line 228, in getOrCreate
sc = SparkContext.getOrCreate(sparkConf)
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/context.py", line 384, in getOrCreate
SparkContext(conf=conf or SparkConf())
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/context.py", line 136, in __init__
SparkContext._assert_on_driver()
File "/usr/local/lib/python3.7/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/context.py", line 1277, in _assert_on_driver
raise Exception("SparkContext should only be created and accessed on the driver.")
Exception: SparkContext should only be created and accessed on the driver.
I don't know what is causing this... Can anyone help me?
Thanks in advance!