Removing all unicode u' from a csv file in python (2.7)

Question

    import os
    import json
    import csv
    import re


    # Directory whose entries are read as JSON input files; the path is
    # relative to the current working directory.
    subdir =  "./json_files/" #'/home/varun/Desktop/pyfile'

    def jsontocsv():
        """Write one row per JSON file found in ``subdir`` to ``test.csv``.

        Every entry of ``subdir`` is loaded as a JSON object and written
        under a fixed header.  Values are placed by key, so each one lands
        in the column that carries its name: a key missing from a file
        yields an empty cell, and a key that is not in the header is left
        out instead of shifting the remaining columns.

        Nested objects are not flattened; the csv module stringifies them
        with ``str()``, so on Python 2 their cells still show ``u'...'``
        prefixes.
        """
        fieldnames = ['name', 'private', 'version', 'dependencies', 'scripts', 'devDependencies']

        # Open the output once and keep it open for all rows, instead of
        # reopening it in append mode for every input file.
        with open('test.csv', 'w') as outfile:
            writer = csv.DictWriter(
                outfile,
                fieldnames=fieldnames,
                restval='',
                extrasaction='ignore',
            )
            writer.writeheader()

            # Sorted so the row order does not depend on directory order.
            for file_name in sorted(os.listdir(subdir)):
                file_path = os.path.join(subdir, file_name)

                with open(file_path, 'r') as json_file:
                    parsed_json = json.load(json_file)

                writer.writerow(parsed_json)


    def _normalise_cell(cell):
        """Return *cell* with a Python-literal dict or list rewritten as JSON.

        A cell holding the ``str()`` of a container, such as
        ``{u'a': u'b'}``, is parsed with ``ast.literal_eval`` and serialised
        again with ``json.dumps``.  That removes the ``u`` prefixes without
        touching the letter "u" inside keys or values.  Any other cell is
        returned unchanged.
        """
        import ast  # local import: the file header does not import ast

        if not isinstance(cell, str):
            # DictReader supplies None for columns missing from a short row.
            return cell
        try:
            value = ast.literal_eval(cell)
        except (ValueError, SyntaxError, TypeError):
            # Plain text such as "0.1.0" is not a Python literal; keep it.
            return cell
        if not isinstance(value, (dict, list)):
            return cell
        try:
            return json.dumps(value, sort_keys=True)
        except (TypeError, ValueError):
            # Not representable as JSON (e.g. tuple keys); keep the text.
            return cell


    def cleanUnicode():
        """Copy ``test.csv`` to ``data.csv`` without ``u`` string prefixes.

        Each cell is cleaned on its own, so only nested dict/list cells are
        rewritten (as JSON text) and scalar cells are copied verbatim.  The
        output always uses the fixed header below; a column absent from the
        input becomes an empty cell.
        """
        fieldnames = ['name', 'private', 'version', 'dependencies', 'scripts', 'devDependencies']

        with open('test.csv', 'r') as csvfile:
            # DictReader consumes the header line itself, so every row it
            # yields is data and none of them must be skipped.
            rows = list(csv.DictReader(csvfile, delimiter=','))

        with open('data.csv', 'w') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()
            for row in rows:
                cleaned = dict(
                    (name, _normalise_cell(row.get(name))) for name in fieldnames
                )
                writer.writerow(cleaned)


    if __name__ == '__main__':
        # Run the two stages in order: build test.csv, then post-process it.
        for step in (jsontocsv, cleanUnicode):
            step()
        print("Scripts finished running all json files parsed to csv")

I am reading multiple JSON files into a single CSV file. The data ends up in the CSV file, but every nested value carries the u' prefix. How can I remove these prefixes and keep only the data I want?

Sample Input:

{
      "version": "0.1.0",
      "devDependencies": {
        "react-scripts": "0.6.1"
      },
      "dependencies": {
        "crossfilter": "^1.3.12",
        "d3": "^4.2.6",
        "d3-scale": "^1.0.3",
        "dc": "^2.0.0-beta.32",
        "immutable": "^3.8.1",
        "jszip": "^3.1.2",
        "react": "^15.3.2",
        "react-addons-transition-group": "^15.3.2",
        "react-dom": "^15.3.2",
        "shifty": "^1.5.2",
        "wolfy87-eventemitter": "^5.1.0"
      },
      "scripts": {
        "start": "react-scripts start",
        "build": "react-scripts build",
        "test": "react-scripts test --env=jsdom",
        "eject": "react-scripts eject"
      }
    }

Output:

version,dependencies,scripts,devDependencies
0.1.0,"{u'wolfy87-eventemitter': u'^5.1.0', u'shifty': u'^1.5.2', u'react-addons-transition-group': u'^15.3.2', u'react-dom': u'^15.3.2', u'dc': u'^2.0.0-beta.32', u'ccbooleananalysis': u'^1.0.0', u'react': u'^15.3.2', u'jszip': u'^3.1.2', u'crossfilter': u'^1.3.12', u'ccnetviz': u'^1.0.8', u'immutable': u'^3.8.1', u'd3': u'^4.2.6', u'd3-scale': u'^1.0.3'}","{u'test': u'react-scripts test --env=jsdom', u'start': u'react-scripts start', u'build': u'react-scripts build', u'eject': u'react-scripts eject'}",{u'react-scripts': u'0.6.1'}

Desired: all u' prefixes removed.

Answer 1

I'm not sure why you want to write dictionaries as strings into your CSV file, but anyway...

Here's one way to get strings without the u Unicode prefix. We process the dictionary that was created by loading the JSON data, encoding all key and value strings to UTF-8; any values that are dictionaries are processed recursively.

This works fine on pure ASCII data. However, any data that's outside the 7-bit ASCII range will appear as \x escape sequences in the written strings. That's not really a problem, though: when you read the CSV file you will probably want to convert those strings back into proper dictionaries. You can use ast.literal_eval for that, and it will happily accept \x escape sequences.

To verify that this code handles Unicode, I've added an extra item to your test data. The "devDependencies" dict now contains a new item: "unicode-test", which has a value of "™©". In the final section of my code I read the CSV data back in, convert the "devDependencies" string back into a dict, and print that dict's 'unicode-test' field to verify that it gets converted back into the correct Unicode string.

BTW, I mostly use Python 3.6 these days, and the most recent version of Python 2 I have is 2.6.6. Its csv module doesn't have a DictWriter.writeheader method, so I use an alternative way of writing the header row.

import json
import csv
import ast

# Name of the CSV file that is written below and then read back to verify.
csvname = 'test.csv' 

# Sample JSON document used as input. The "unicode-test" entry holds
# non-ASCII text so the round trip through the CSV file can be checked.
src = '''\
{
    "version": "0.1.0",
    "devDependencies": {
        "unicode-test": "™©",
        "react-scripts": "0.6.1"
    },
    "dependencies": {
        "crossfilter": "^1.3.12",
        "d3": "^4.2.6",
        "d3-scale": "^1.0.3",
        "dc": "^2.0.0-beta.32",
        "immutable": "^3.8.1",
        "jszip": "^3.1.2",
        "react": "^15.3.2",
        "react-addons-transition-group": "^15.3.2",
        "react-dom": "^15.3.2",
        "shifty": "^1.5.2",
        "wolfy87-eventemitter": "^5.1.0"
    },
    "scripts": {
        "start": "react-scripts start",
        "build": "react-scripts build",
        "test": "react-scripts test --env=jsdom",
        "eject": "react-scripts eject"
    }
}
'''

# On Python 2 the parsed keys and string values are unicode objects, which
# is where the u prefixes in the printed/CSV output come from.
data = json.loads(src)

# Codec that encode_dict uses to turn unicode strings into byte strings.
encoding = 'utf8'

def _encode_value(v):
    """Return *v* with every unicode string inside it encoded to bytes.

    Dicts and lists are processed recursively.  Values that are not
    strings (numbers, booleans, None) are returned unchanged: they have no
    ``encode`` method, and their repr carries no ``u`` prefix anyway.
    """
    if isinstance(v, dict):
        return encode_dict(v)
    if isinstance(v, list):
        return [_encode_value(item) for item in v]
    if isinstance(v, unicode):
        return v.encode(encoding)
    return v


def encode_dict(d):
    """Return a copy of dict *d* with unicode keys and values encoded.

    Keys and string values are encoded with the module-level ``encoding``
    so that the repr of the result has no ``u`` prefixes on Python 2.
    Nested dicts and lists are handled recursively; the input dict is not
    modified.
    """
    newd = {}
    for k, v in d.iteritems():
        newd[_encode_value(k)] = _encode_value(v)
    return newd

clean_data = encode_dict(data) 
print clean_data
print '- ' * 20

fieldnames = ['version', 'dependencies', 'scripts', 'devDependencies']

with open(csvname, 'wb') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    #writer.writeheader()
    # Write header, the old-fashioned way
    writer.writerow(dict((s, s) for s in fieldnames))
    writer.writerow(clean_data)

# Verify
with open(csvname, 'rb') as infile:
    reader = csv.DictReader(infile)
    for row in reader:
        print row
        s = row['devDependencies']
        d = ast.literal_eval(s)
        print d['unicode-test']

output

{'devDependencies': {'unicode-test': '\xe2\x84\xa2\xc2\xa9', 'react-scripts': '0.6.1'}, 'version': '0.1.0', 'dependencies': {'wolfy87-eventemitter': '^5.1.0', 'react-addons-transition-group': '^15.3.2', 'react-dom': '^15.3.2', 'd3-scale': '^1.0.3', 'dc': '^2.0.0-beta.32', 'jszip': '^3.1.2', 'react': '^15.3.2', 'crossfilter': '^1.3.12', 'shifty': '^1.5.2', 'd3': '^4.2.6', 'immutable': '^3.8.1'}, 'scripts': {'test': 'react-scripts test --env=jsdom', 'start': 'react-scripts start', 'build': 'react-scripts build', 'eject': 'react-scripts eject'}}
- - - - - - - - - - - - - - - - - - - - 
{'devDependencies': "{'unicode-test': '\\xe2\\x84\\xa2\\xc2\\xa9', 'react-scripts': '0.6.1'}", 'version': '0.1.0', 'dependencies': "{'wolfy87-eventemitter': '^5.1.0', 'react-addons-transition-group': '^15.3.2', 'react-dom': '^15.3.2', 'd3-scale': '^1.0.3', 'dc': '^2.0.0-beta.32', 'jszip': '^3.1.2', 'react': '^15.3.2', 'crossfilter': '^1.3.12', 'shifty': '^1.5.2', 'd3': '^4.2.6', 'immutable': '^3.8.1'}", 'scripts': "{'test': 'react-scripts test --env=jsdom', 'start': 'react-scripts start', 'build': 'react-scripts build', 'eject': 'react-scripts eject'}"}
™©

contents of test.csv

version,dependencies,scripts,devDependencies
0.1.0,"{'wolfy87-eventemitter': '^5.1.0', 'react-addons-transition-group': '^15.3.2', 'react-dom': '^15.3.2', 'd3-scale': '^1.0.3', 'dc': '^2.0.0-beta.32', 'jszip': '^3.1.2', 'react': '^15.3.2', 'crossfilter': '^1.3.12', 'shifty': '^1.5.2', 'd3': '^4.2.6', 'immutable': '^3.8.1'}","{'test': 'react-scripts test --env=jsdom', 'start': 'react-scripts start', 'build': 'react-scripts build', 'eject': 'react-scripts eject'}","{'unicode-test': '\xe2\x84\xa2\xc2\xa9', 'react-scripts': '0.6.1'}"

Removing all unicode u' from a csv file in python (2.7)

Question

1 answers

solution1
0 2017-01-08 13:20:25

Removing all unicode u' from a csv file in python (2.7)

Question

1 answers

solution1 0 2017-01-08 13:20:25

solution1
0 2017-01-08 13:20:25