简体   繁体   中英

fuzzywuzzy returning single characters, not strings

I'm not sure where I'm going wrong here or why my code is returning the wrong data. I'm writing this code to use fuzzywuzzy to clean badly entered road names against a list of correct names, replacing each incorrect name with its closest match.

It's returning every line of data2 back. I want it to return the lines of data1 instead, either unchanged or replaced with the closest match.

My Minimal, Reproducible Example:

import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Minimal reproduction: one misspelled input address and the reference
# street name(s) it should be corrected against.
data1 =('3176 DETRIT ROAD')
data2 =('DETROIT RD')

# On success data1 is rebound to the tagger's result; the lookups below read
# the labelled address components from its first element.
# NOTE(review): if RepeatedLabelError is swallowed here, data1 is still the
# original str and the data1[0].get(...) calls below fail, because a str has
# no .get method.
try:
    data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
    pass

# Pull the individual components, defaulting to '' when a label is absent.
# roaddnum2 is not used again in this snippet.
roaddnum2 = data1[0].get('AddressNumber', '')
roadir2 = data1[0].get('StreetNamePreDirectional', '')
roadname2 = data1[0].get('StreetName', '')
roaddsg2 = data1[0].get('StreetNamePostType', '')

# Rebuild "<direction> <name> <type>"; strip() removes the padding left at
# the ends by empty components.
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
# Turn the reference string into a list of names. Without this, `in` below
# would be a substring test and iterating data2 would yield single characters.
data2 = data2.split(',')

# Only fuzzy-match when the rebuilt street is not already an exact entry.
if street2 not in data2:
    street2 = process.extract(street2, data2)
    # Print the first entry of the list returned by process.extract.
    print(street2[0])

My full code

import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def convert_tolist(string):
    """Split *string* on single space characters and return the pieces as a list.

    Consecutive spaces produce empty-string entries, and an empty input
    yields ``[""]``, because the separator is given explicitly.
    """
    return string.split(" ")

# Open the source CSV (csv_file), an output file (f, opened for writing but
# never written to in this snippet) and the reference street list (ul).
with open(r"Cass_Howard - Copy.csv") as csv_file,\
        open("Final_Test_Clean.csv", "w") as f,\
        open(r"TEST_no_dups12.csv") as ul:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # Consume the first row of the source CSV (presumably the header).
    next(csv_reader)
    # NOTE(review): this rebinds csv_reader to a reader over the write-mode
    # handle f; the new reader is never iterated.
    csv_reader = csv.reader(f, delimiter=',')
    # The loop below iterates the raw file objects, not a csv.reader.
    file_1 = csv_file
    file_2 = ul

    # zip() pairs line i of file_1 with line i of file_2 and stops at the
    # shorter file, so each address is only ever compared with ONE reference
    # line rather than with the whole reference list.
    for data1, data2 in zip(file_1, file_2):
        # Take the field at index 18 of the comma-split line and upper-case it.
        data1 = data1.split(',')
        data1 = data1[18]
        data1 = data1.upper()
        # strip -> split on ',' -> join with '' leaves data2 as a single str
        # again (the reference line with any commas removed).
        data2 = data2.strip()
        data2 = data2.split(',')
        data2 = ''.join(data2)
        # NOTE(review): if RepeatedLabelError is swallowed, data1 is still a
        # str and the data1[0].get(...) calls below fail (str has no .get).
        try:
            data1 = usaddress.tag(data1)
        except usaddress.RepeatedLabelError:
            pass

        # Labelled components, '' when absent. roaddnum2 is not used again
        # in this snippet.
        roaddnum2 = data1[0].get('AddressNumber', '')
        roadir2 = data1[0].get('StreetNamePreDirectional', '')
        roadname2 = data1[0].get('StreetName', '')
        roaddsg2 = data1[0].get('StreetNamePostType', '')

        # Rebuild "<direction> <name> <type>" and trim the padding left at
        # the ends by empty components.
        street2 = (roadir2, roadname2, roaddsg2)
        street2 = " ".join(street2)
        street2 = street2.strip()
        # data1 is not read again after this line.
        data1 = list(data1)
        # NOTE(review): the return value is discarded, so data2 is still a
        # str after this call.
        convert_tolist(data2)


        # Because data2 is a str here, `not in` is a substring test and
        # iterating data2 yields one character at a time -- which is why the
        # matches come back as single characters instead of street names.
        if street2 not in data2:
            street2 = process.extract(street2, data2)
            print(street2)

street2 query data(around 950 lines)

DETROIT ROAD
DETROIT ROAD
MANNIX ST
MANNIX ST

data2 choices data(around 200 lines)

ACRES
ADERSON RD
AIRPORT RD
ALGONQUIN

Okay, I'm not certain I've fully understood your issue, but modifying your reprex, I have produced the following solution.

import usaddress
from fuzzywuzzy import process

# Input address to clean and the list of known-good street names.
data1 = "3176 DETRIT ROAD"
choices = ["DETROIT RD"]

# Tag the address; on a repeated-label failure data1 is left as the raw string.
try:
    data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
    pass

# The first element of the tagger's result maps labels to address components.
tagged = data1[0]

# Collect the street-name components in display order; absent ones are None.
parts = [
    tagged.get(label)
    for label in ("StreetNamePreDirectional", "StreetName", "StreetNamePostType")
]

# Drop the missing components and glue the rest together.
street = " ".join(filter(None, parts))

# Not an exact entry: substitute the top fuzzy match and put the house
# number back in front of it.
if street not in choices:
    best_match = process.extract(street, choices)[0][0]
    street = " ".join([tagged.get("AddressNumber"), best_match])

print(street)

This yields:

3176 DETROIT RD

So basically, it replaces the street-name parts with the best match from the choices list using the process.extract() function.


And here it is as a callable function:

import usaddress
from fuzzywuzzy import process


def best_street(addr: str, choices: list[str]) -> str:
    """Return *addr* with its street portion replaced by the closest choice.

    The address is tagged with ``usaddress``; the pre-directional, street
    name and post-type components are joined into a query, and the best
    fuzzy match from *choices* is substituted for them. The house number,
    when present, is kept in front of the match.

    Args:
        addr: Free-form street address, e.g. ``"3176 DETRIT ROAD"``.
        choices: Known-good street names to match against.

    Returns:
        ``"<number> <best match>"``, or just the best match when the address
        has no house number. *addr* is returned unchanged when it cannot be
        tagged, contains no street components, or *choices* is empty.
    """
    try:
        tagged = usaddress.tag(addr)[0]
    except usaddress.RepeatedLabelError:
        # The tagger could not label the address unambiguously. Falling
        # through would hit an unbound local, so pass the input back as-is.
        return addr

    street = " ".join(
        tagged[label]
        for label in ("StreetNamePreDirectional", "StreetName", "StreetNamePostType")
        if tagged.get(label)
    )
    if not street:
        # Nothing to match on; an empty query would just pick an arbitrary choice.
        return addr

    # limit=1: only the top-scoring candidate is needed. An empty *choices*
    # yields an empty list rather than an IndexError on [0].
    matches = process.extract(street, choices, limit=1)
    if not matches:
        return addr
    best_match = matches[0][0]

    # AddressNumber may be absent; str.join would raise TypeError on None.
    number = tagged.get("AddressNumber")
    return " ".join([number, best_match]) if number else best_match


if __name__ == "__main__":
    # Demo: correct two misspelled addresses against a small reference list.
    choices = ["AIRPORT RD", "DETROIT RD"]
    for sample in ("123 Detrt", "9876 AIRPUMP DR"):
        print(best_street(sample, choices))

yielding:

123 DETROIT RD
9876 AIRPORT RD

My issue here was that fuzzywuzzy expects the choices to be a list (or other collection) of strings; a plain string is iterated one character at a time, so I had to add data2 = data2.split(',') to get the full strings returned.

My other issue was thinking I needed to use zip() to compare my files, when zip() pairs items up in parallel (line 1 with line 1, line 2 with line 2) rather than producing the cross product of the two files.

Here is what I came up with that works. There is no issue with the code but fuzzywuzzy is not accurate enough for this tool to be practical to clean my data with the amount of typos in my address data.

If you can think of a way for me to clean up the if/else statements, I'd be willing to hear it.

import os
import csv
import usaddress
import pandas as pd
from fuzzywuzzy import process

# Input/output locations and layout of the well-address data.
SOURCE_CSV = r"TEST_Cass_Howard.csv"      # raw data; first row is skipped as a header
CHOICES_FILE = r"TEST_Unique_List.csv"    # one known-good street name per line
OUTPUT_CSV = "Cass_Howard_Clean.csv"      # copy of SOURCE_CSV plus the cleaned column
ADDRESS_COLUMN = 18                       # zero-based index of the raw address field
CLEAN_COLUMN = "WELL_ADDR_CLN"            # name of the inserted column
MISSING = "Null"                          # marker for rows with no usable address


def _clean_address(raw, choices, known):
    """Return the cleaned form of one raw address cell.

    Args:
        raw: Text of the address cell as read from the CSV.
        choices: List of known-good street names handed to the fuzzy matcher.
        known: The same names as a set, for exact-match lookups.

    Returns:
        ``MISSING`` when the cell is blank or has no street components; the
        upper-cased input when usaddress cannot tag it; otherwise the house
        number (if any) followed by the exact or best-matching street name.
    """
    address = raw.upper().strip()
    if not address:
        return MISSING

    try:
        tagged = usaddress.tag(address)[0]
    except usaddress.RepeatedLabelError:
        # Fall back to the raw text. There is no tagged mapping to read
        # components from, so matching is skipped for this row.
        return address

    street = " ".join(
        part
        for part in (
            tagged.get("StreetNamePreDirectional", ""),
            tagged.get("StreetName", ""),
            tagged.get("StreetNamePostType", ""),
        )
        if part
    )
    if not street:
        return MISSING

    # An exact hit keeps the street as written; otherwise take the closest
    # reference name. extractOne returns None when there are no choices.
    if street not in known:
        match = process.extractOne(street, choices)
        if match is not None:
            street = match[0]

    number = tagged.get("AddressNumber", "")
    return " ".join([number, street]) if number else street


# Strip the line endings so exact matches can succeed and output rows do not
# depend on the newline carried by whichever choice happened to match.
with open(CHOICES_FILE) as ul:
    correct = [name.strip() for name in ul if name.strip()]
known_streets = set(correct)

with open(SOURCE_CSV, newline="") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=",")
    next(csv_reader, None)  # skip the header row
    # Completely blank lines are skipped here because pandas.read_csv skips
    # them too; this keeps the cleaned values aligned with the frame's rows.
    # Rows too short to hold the address field are treated as blank.
    cleaned = [
        _clean_address(
            row[ADDRESS_COLUMN] if len(row) > ADDRESS_COLUMN else "",
            correct,
            known_streets,
        )
        for row in csv_reader
        if row
    ]

# Insert the cleaned column directly after the raw address column. Passing a
# Series lets pandas align on the row index, as the previous join-based
# version did, without a temporary file round trip.
df = pd.read_csv(SOURCE_CSV)
df.insert(
    loc=ADDRESS_COLUMN + 1,
    column=CLEAN_COLUMN,
    value=pd.Series(cleaned, dtype=object),
)
df.to_csv(OUTPUT_CSV, index=False)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM