I am trying to replace all the HTML/Unicode entity sequences in a column's values with their corresponding characters.
My dataframe has columns whose data contains strings like "<REFERENCE ID=11458 TYPE=trademark/>".
I created a dictionary mapping each entity to its replacement and used it inside a for loop:
# Map of characters to their replacements, applied with one regexp_replace per key.
# NOTE(review): keys equal values here, so every replacement is a no-op — the
# dictionary presumably held HTML entity names (e.g. "&lt;" -> "<") before this
# page's rendering decoded them; restore the entity keys before reusing this.
carat_dict = {"<":'<', '>':'>','<':'<', '>':'>'}
for key,val in carat_dict.items():
    # NOTE(review): each iteration starts again from `carat_repl`, so only the
    # replacement for the last key survives; accumulating on the result
    # (df = df.withColumn(...)) was probably intended. Chaining ~100 of these
    # also builds a deeply nested query plan, which is what triggers the
    # StackOverflowError mentioned below.
    df = carat_repl.withColumn("SYNONYMS_ENGLISH_1", regexp_replace("SYNONYMS_ENGLISH_1",key,val))
This works, but I have a bigger dictionary with over 100 key-value pairs, and when I loop over it I get a StackOverflowError. Is there a better way to solve this?
Tried an implementation with reduce and a regex sub; an explanation of the Python regex approach can be found here.
input_str = """
<REFERENCE ID=11458 TYPE=trademark/>,
<REFERENCE ID=11458 TYPE=trademark/<,
>REFERENCE ID=11458 TYPE=trademark/<,
>REFERENCE ID=11458 TYPE=trademark/>,
""".split(",")

# Trim whitespace around each comma-separated piece; the literal text 'null'
# marks a SQL NULL and becomes None.
input_values = [piece.strip() if piece.strip() != 'null' else None for piece in input_str]
cols = [piece.strip() if piece.strip() != 'null' else None for piece in "SYNONYMS_ENGLISH_1".split(",")]

# Chunk the flat list of cell values into row tuples of n_col columns each.
n = len(input_values)
n_col = 1
input_list = [tuple(input_values[start:start + n_col]) for start in range(0, n, n_col)]
# Build the Spark dataframe from the prepared row tuples and column names,
# then print it without truncating wide values.
sparkDF = sql.createDataFrame(data=input_list, schema=cols)
sparkDF.show(truncate=False)
+--------------------------------------------+
|SYNONYMS_ENGLISH_1 |
+--------------------------------------------+
|<REFERENCE ID=11458 TYPE=trademark/> |
|<REFERENCE ID=11458 TYPE=trademark/< |
|>REFERENCE ID=11458 TYPE=trademark/<|
|>REFERENCE ID=11458 TYPE=trademark/>|
+--------------------------------------------+
@F.udf(StringType())
def replace_unicode(inp):
    """Spark UDF: apply every replacement in carat_dict to *inp* in a single
    regex pass — one compiled alternation instead of one pass per key, which
    avoids chaining 100+ regexp_replace calls on the query plan.

    NOTE(review): the keys and values of carat_dict are identical here, which
    makes the substitution a no-op — the original mapping presumably used
    HTML entity names (e.g. "&lt;" -> "<") that were decoded when this page
    was rendered. Restore the entity-name keys before reusing this.
    """
    carat_dict = {"<":'<', '>':'>','<':'<', '>':'>'}
    # the simplest, lambda-based implementation
    def multiple_replace(adict, text):
        # Create a regular expression from all of the dictionary keys
        regex = re.compile("|".join(map(re.escape, adict.keys( ))))
        # For each match, look up the corresponding value in the dictionary
        return regex.sub(lambda match: adict[match.group(0)], text)
    return multiple_replace(carat_dict,inp)
# Apply the replace_unicode UDF to every listed column (here just one),
# threading the dataframe through each successive withColumn call.
for _column in ['SYNONYMS_ENGLISH_1']:
    sparkDF = sparkDF.withColumn(_column, replace_unicode(F.col(_column)))
sparkDF.show(truncate=False)
+------------------------------------+
|SYNONYMS_ENGLISH_1 |
+------------------------------------+
|<REFERENCE ID=11458 TYPE=trademark/>|
|<REFERENCE ID=11458 TYPE=trademark/<|
|>REFERENCE ID=11458 TYPE=trademark/<|
|>REFERENCE ID=11458 TYPE=trademark/>|
+------------------------------------+
Solved it the following way; still, a few HTML entities don't get parsed:
import html
def basic_data(ip_df):
    """Cast every column of *ip_df* to string, then run the html_encode UDF
    over each column, returning the transformed dataframe."""
    stringified = ip_df.select([col(name).cast("string") for name in ip_df.columns])
    result = stringified
    for name in result.columns:
        result = result.withColumn(name, html_encode(name))
    return result
@F.udf
def html_encode(Columntext):
    """Spark UDF: decode HTML entities (e.g. "&lt;" -> "<") in a cell value.

    A None (null) cell is treated as an empty string so the UDF always
    returns a string. The original body contained a bare `Columntext`
    expression statement that did nothing; it has been removed.
    """
    if Columntext is None:
        # html.unescape("") is "", so returning "" directly is equivalent.
        return ""
    return html.unescape(Columntext)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.