
Replace Unicode entities in PySpark

I am trying to replace all the Unicode/HTML entities in a column value with their proper characters.

My DataFrame has columns whose data contains values like "&lt;REFERENCE ID=11458 TYPE=trademark/&gt;", which should read "<REFERENCE ID=11458 TYPE=trademark/>".

I have created a dictionary for all the entities and use it inside a for loop:

from pyspark.sql.functions import regexp_replace

carat_dict = {"&lt;": '<', '&gt;': '>', '&#60;': '<', '&#62;': '>'}

for key, val in carat_dict.items():
    df = df.withColumn("SYNONYMS_ENGLISH_1", regexp_replace("SYNONYMS_ENGLISH_1", key, val))

This works, but I have a bigger dictionary with over 100 key-value pairs, and when I loop over it I get a StackOverflowError. Is there a better way to solve this?
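
For reference, the same chain of replacements can be folded into a single column expression, so the plan stays one projection instead of one withColumn per key. This is only a sketch reusing carat_dict and the column name from above; with a much larger dictionary, any keys containing regex metacharacters would need escaping, since regexp_replace treats the key as a pattern:

from functools import reduce
from pyspark.sql import functions as F

# Fold every (entity, character) pair into one nested regexp_replace
# expression, then apply it with a single withColumn.
replaced = reduce(
    lambda c, kv: F.regexp_replace(c, kv[0], kv[1]),
    carat_dict.items(),
    F.col("SYNONYMS_ENGLISH_1"),
)
df = df.withColumn("SYNONYMS_ENGLISH_1", replaced)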

Tried an implementation with reduce and regex sub; a Python regex explanation can be found here.

Data Preparation

input_str = """
&lt;REFERENCE ID=11458 TYPE=trademark/&gt;,
&lt;REFERENCE ID=11458 TYPE=trademark/&#60;,
&#62;REFERENCE ID=11458 TYPE=trademark/&#60;,
&#62;REFERENCE ID=11458 TYPE=trademark/&#62;,
""".split(",")

input_values = list(map(lambda x: x.strip() if x.strip() != 'null' else None, input_str))

cols = list(map(lambda x: x.strip() if x.strip() != 'null' else None, "SYNONYMS_ENGLISH_1".split(",")))
            
n = len(input_values)
n_col = 1

input_list = [tuple(input_values[i:i+n_col]) for i in range(0,n,n_col)]

sparkDF = sql.createDataFrame(input_list, cols)

sparkDF.show(truncate=False)

+--------------------------------------------+
|SYNONYMS_ENGLISH_1                          |
+--------------------------------------------+
|&lt;REFERENCE ID=11458 TYPE=trademark/&gt;  |
|&lt;REFERENCE ID=11458 TYPE=trademark/&#60; |
|&#62;REFERENCE ID=11458 TYPE=trademark/&#60;|
|&#62;REFERENCE ID=11458 TYPE=trademark/&#62;|
+--------------------------------------------+

Reduce

import re
from functools import reduce

from pyspark.sql import functions as F
from pyspark.sql.types import StringType


@F.udf(StringType())
def replace_unicode(inp):

    carat_dict = {"&lt;": '<', '&gt;': '>', '&#60;': '<', '&#62;': '>'}

    # The simplest, lambda-based implementation
    def multiple_replace(adict, text):

        # Create a single regular expression from all of the dictionary keys
        regex = re.compile("|".join(map(re.escape, adict.keys())))

        # For each match, look up the corresponding value in the dictionary
        return regex.sub(lambda match: adict[match.group(0)], text)

    return multiple_replace(carat_dict, inp) if inp is not None else None


sparkDF = reduce(
    lambda df, x: df.withColumn(x, replace_unicode(F.col(x))),
    ['SYNONYMS_ENGLISH_1'],
    sparkDF,
)

sparkDF.show(truncate=False)

+------------------------------------+
|SYNONYMS_ENGLISH_1                  |
+------------------------------------+
|<REFERENCE ID=11458 TYPE=trademark/>|
|<REFERENCE ID=11458 TYPE=trademark/<|
|>REFERENCE ID=11458 TYPE=trademark/<|
|>REFERENCE ID=11458 TYPE=trademark/>|
+------------------------------------+
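
Since the reduce above runs over a list of column names, the same pattern extends to every column in the DataFrame (a sketch, assuming all columns are strings):

sparkDF = reduce(
    lambda df, x: df.withColumn(x, replace_unicode(F.col(x))),
    sparkDF.columns,
    sparkDF,
)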

Solved it the following way, though a few HTML entities still don't get parsed:

import html

from pyspark.sql import functions as F
from pyspark.sql.functions import col


def basic_data(ip_df):
    df = ip_df

    # Cast every column to string, then unescape the HTML entities in each
    df = df.select([col(c).cast("string") for c in df.columns])
    for df_col in df.columns:
        df = df.withColumn(df_col, html_encode(df_col))

    return df


@F.udf
def html_encode(Columntext):
    # html.unescape decodes both named (&lt;) and numeric (&#60;) entities
    if Columntext is None:
        Columntext = ""
    return html.unescape(Columntext)
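
As a quick sanity check outside Spark, html.unescape from the standard library decodes both named and numeric entities (plain Python; sample strings taken from the data above):

import html

print(html.unescape("&lt;REFERENCE ID=11458 TYPE=trademark/&gt;"))
# <REFERENCE ID=11458 TYPE=trademark/>
print(html.unescape("&#60;trademark&#62; &amp; &quot;quoted&quot;"))
# <trademark> & "quoted"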
