I'm not sure where I'm going wrong here and why my data is returning wrong. Writing this code to use fuzzywuzzy to clean bad input road names against a list of correct names, replacing the incorrect with the closest match.
It's returning all lines of data2
back. I'm looking for it to return the same, or replaced lines of data1
back to me.
My Minimal, Reproducible Example:
import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
data1 =('3176 DETRIT ROAD')
data2 =('DETROIT RD')
try:
data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
pass
roaddnum2 = data1[0].get('AddressNumber', '')
roadir2 = data1[0].get('StreetNamePreDirectional', '')
roadname2 = data1[0].get('StreetName', '')
roaddsg2 = data1[0].get('StreetNamePostType', '')
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
data2 = data2.split(',')
if street2 not in data2:
street2 = process.extract(street2, data2)
print(street2[0])
My full code
import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
def convert_tolist(string):
li = list(string.split(" "))
return li
with open(r"Cass_Howard - Copy.csv") as csv_file,\
open("Final_Test_Clean.csv", "w") as f,\
open(r"TEST_no_dups12.csv") as ul:
csv_reader = csv.reader(csv_file, delimiter=',')
next(csv_reader)
csv_reader = csv.reader(f, delimiter=',')
file_1 = csv_file
file_2 = ul
for data1, data2 in zip(file_1, file_2):
data1 = data1.split(',')
data1 = data1[18]
data1 = data1.upper()
data2 = data2.strip()
data2 = data2.split(',')
data2 = ''.join(data2)
try:
data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
pass
roaddnum2 = data1[0].get('AddressNumber', '')
roadir2 = data1[0].get('StreetNamePreDirectional', '')
roadname2 = data1[0].get('StreetName', '')
roaddsg2 = data1[0].get('StreetNamePostType', '')
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
data1 = list(data1)
convert_tolist(data2)
if street2 not in data2:
street2 = process.extract(street2, data2)
print(street2)
street2
query data(around 950 lines)
DETROIT ROAD
DETROIT ROAD
MANNIX ST
MANNIX ST
data2
choices data(around 200 lines)
ACRES
ADERSON RD
AIRPORT RD
ALGONQUIN
Okay, I'm not certain I've fully understood your issue, but modifying your reprex, I have produced the following solution.
import usaddress
from fuzzywuzzy import process
data1 = "3176 DETRIT ROAD"
choices = ["DETROIT RD"]
try:
data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
pass
parts = [
data1[0].get("StreetNamePreDirectional"),
data1[0].get("StreetName"),
data1[0].get("StreetNamePostType"),
]
street = " ".join([x for x in parts if x])
if street not in choices:
street = " ".join(
[data1[0].get("AddressNumber"), process.extract(street, choices)[0][0]]
)
print(street)
This yields:
3176 DETROIT RD
So basically, it has replaces the street name bits with the best match from the choices list using the process.extract()
function.
And here it is as a callable function:
import usaddress
from fuzzywuzzy import process
def best_street(addr: str, choices: list[str]) -> str:
try:
usaddr = usaddress.tag(addr)
except usaddress.RepeatedLabelError:
pass
street_parts = [
usaddr[0].get("StreetNamePreDirectional"),
usaddr[0].get("StreetName"),
usaddr[0].get("StreetNamePostType"),
]
street = " ".join([x for x in street_parts if x])
return " ".join(
[usaddr[0].get("AddressNumber"), process.extract(street, choices)[0][0]]
)
if __name__ == "__main__":
choices = ["AIRPORT RD", "DETROIT RD"]
print(best_street("123 Detrt", choices))
print(best_street("9876 AIRPUMP DR", choices))
yielding:
123 DETROIT RD
9876 AIRPORT RD
My issue here was fuzzywuzzy requires you to pass an iterable so I had to add data2 = data2.split(',')
to get the full strings to return.
My other issue was thinking I needed to use zip()
to compare my files when zip()
is for comparing parallel and not cross products.
Here is what I came up with that works. There is no issue with the code but fuzzywuzzy is not accurate enough for this tool to be practical to clean my data with the amount of typos in my address data.
If you can think of a way for me to clean up the if/else statement i'd be willing to hear it.
import os
import csv
import usaddress
import pandas as pd
from fuzzywuzzy import process
with open(r"TEST_Cass_Howard.csv") as csv_file, \
open(".\Scratch\Final_Test_Clean.csv", "w") as f, \
open(r"TEST_Unique_List.csv") as ul:
csv_reader = csv.reader(csv_file, delimiter=',')
next(csv_reader)
csv_reader1 = csv.reader(f, delimiter=',')
correct = list(ul)
for line in csv_reader:
line = line[18]
line = line.upper()
if line == '' or line == ' ':
line = ''
else:
try:
addressbrk = usaddress.tag(line)
except usaddress.RepeatedLabelError:
addressbrk = line
line = addressbrk # alt output: ('Please fix the incorect format of: %s,' % addressbrk)
if line != '':
roadnum2 = line[0].get('AddressNumber', '')
roadir2 = line[0].get('StreetNamePreDirectional', '')
roadname2 = line[0].get('StreetName', '')
roaddsg2 = line[0].get('StreetNamePostType', '')
else:
line = ''
if line != '':
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
else:
street2 = ''
if street2 != '':
if street2 not in correct:
street2 = process.extractOne(street2, correct)
else:
street2 = '\n'
if street2 != '':
tgthr = (roadnum2, street2[0])
else:
tgthr = ''
if tgthr != '':
final = (' '.join(tgthr))
else:
final = ('Null' + '\n')
f.writelines(final)
original = r"TEST_Cass_Howard.csv"
target = (r'.\Scratch\Cass_Howard_Clean.csv')
shutil.copyfile(original, target)
df1 = pd.read_csv('.\Scratch\Final_Test_Clean.csv', header=None)
df1.columns = ["WELL_ADDR_CLN"]
df = pd.read_csv('.\Scratch\Cass_Howard_Clean.csv')
df = df.join(df1)
new_data = df['WELL_ADDR_CLN']
df = df.drop(columns=['WELL_ADDR_CLN'])
df.insert(loc=19, column='WELL_ADDR_CLN', value=new_data)
os.remove(".\Scratch\Cass_Howard_Clean.csv")
df.to_csv("Cass_Howard_Clean.csv", index=False)
os.remove('.\Scratch\Final_Test_Clean.csv')
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.