I'm trying to move away from `iterrows` due to its poor performance. However, I can't find another way to compare each row of one dataframe with each row of another dataframe.
I have two dataframes each containing a latitude and a longitude. Previously I have used these two functions to make a distance calculation between the two coordinates shown here:
def find_matches(first_HL, second_HL, N, M):
    """Collect the rows of ``first_HL`` that have a match in ``second_HL``.

    Every row of ``first_HL`` whose 'PL Name', 'PL Latitude' and
    'PL Longitude' are all present is passed to ``name_and_distance_match``.
    Rows for which that helper returns a Series are gathered into the result,
    and a progress line is printed for each one.

    Args:
        first_HL: DataFrame with 'PL Name', 'PL Latitude' and 'PL Longitude'
            columns.
        second_HL: DataFrame forwarded unchanged to ``name_and_distance_match``.
        N: Threshold forwarded to ``name_and_distance_match``.
        M: Threshold forwarded to ``name_and_distance_match``.

    Returns:
        A DataFrame with one row per match (index reset), or an empty
        DataFrame when nothing matched.
    """
    program_start = time.time()
    matched_frames = []
    for i_WP, r_WP in first_HL.iterrows():
        # pd.notna rejects NaN as well as None; a plain ``is not None`` test
        # lets NaN coordinates through to the distance calculation.
        has_required_fields = (
            pd.notna(r_WP['PL Name'])
            and pd.notna(r_WP['PL Latitude'])
            and pd.notna(r_WP['PL Longitude'])
        )
        if not has_required_fields:
            continue

        matched = name_and_distance_match(i_WP, r_WP, second_HL, N, M)
        if matched is None:
            continue

        # Keep the one-row frame and concatenate once at the end: calling
        # pd.concat on the growing result inside the loop copies it each time.
        matched_frames.append(matched.to_frame().T)
        now = time.time()
        print("------ MATCH FOUND ------ ", r_WP['PL Name'], "------", round(now - program_start, 2), "seconds")

    if not matched_frames:
        return pd.DataFrame()
    return pd.concat(matched_frames, axis=0, ignore_index=True)
def calc_distance(r_WP, r_HL):
    """Return the geodesic distance in kilometres between two rows.

    ``r_WP`` supplies 'PL Latitude' / 'PL Longitude' and ``r_HL`` supplies
    'Latitude' / 'Longitude'. The result is rounded to two decimal places.
    """
    origin = (r_WP['PL Latitude'], r_WP['PL Longitude'])
    target = (r_HL['Latitude'], r_HL['Longitude'])
    return round(geopy.distance.geodesic(origin, target).km, 2)
def name_and_distance_match(i_WP, r_WP, second_HL, N, M):
    """Return ``r_WP`` annotated with its first match in ``second_HL``, or None.

    A row of ``second_HL`` is a match when it has a 'Site Name', lies within
    0.1 degrees of ``r_WP`` in both latitude and longitude, is closer than
    ``M`` km according to ``calc_distance``, and its 'HL Site Short' value
    (after ``filter_town``) scores above ``N`` with ``fuzzy`` against
    ``r_WP['PL Name']``.

    Args:
        i_WP: Index label of ``r_WP`` (unused here).
        r_WP: Row Series with 'PL Name', 'PL Latitude' and 'PL Longitude'.
            It is modified in place: 'Distance (km)' and 'Name Similarity'
            are written for the candidates that reach those steps.
        second_HL: DataFrame with 'Site Name', 'Latitude' and 'Longitude'.
        N: Name-similarity score that a match must exceed.
        M: Distance in km that a match must stay under.

    Returns:
        ``r_WP`` with 'HL Site Short' set from the matching row, or None when
        no row of ``second_HL`` qualifies.
    """
    # The name of the row being matched does not change between candidates,
    # so check it once instead of on every iteration.
    if pd.isnull(r_WP['PL Name']):
        return None

    for _i_HL, r_HL in second_HL.iterrows():
        if pd.isnull(r_HL['Site Name']):
            continue
        # Cheap bounding-box rejection before the geodesic calculation.
        if abs(r_WP['PL Latitude'] - r_HL['Latitude']) > 0.1:
            continue
        if abs(r_WP['PL Longitude'] - r_HL['Longitude']) > 0.1:
            continue

        distance_km = r_WP['Distance (km)'] = calc_distance(r_WP, r_HL)
        if not distance_km < M:
            continue

        r_HL = filter_town(r_WP, r_HL)
        # Both arguments are scalars, so call fuzzy directly; wrapping it in
        # np.vectorize only turned the score into a 0-d array.
        score = r_WP['Name Similarity'] = fuzzy(r_HL["HL Site Short"], r_WP['PL Name'])
        if score > N:
            r_WP["HL Site Short"] = r_HL["HL Site Short"]
            return r_WP

    return None
Is there a way I can do this without `iterrows`? The solution I'm working on at the moment looks like this:
def distance_check(first_HL, second_WPHL):
    """Return whether two sites are within 0.2 degrees of each other.

    The gap is the sum of the absolute latitude and longitude differences.

    Args:
        first_HL: Mapping or row with 'Latitude' and 'Longitude'.
        second_WPHL: Mapping or row with 'PL Latitude' and 'PL Longitude'.

    Returns:
        False when the combined gap exceeds 0.2, otherwise True.
    """
    # Compare each side's own coordinates. Reading both latitudes from
    # first_HL and both longitudes from second_WPHL makes every gap zero.
    lat_gap = abs(first_HL["Latitude"] - second_WPHL["PL Latitude"])
    long_gap = abs(first_HL["Longitude"] - second_WPHL["PL Longitude"])
    if lat_gap + long_gap > 0.2:
        return False

    # NOTE(review): this copies the whole 'Site Name' column between two
    # module-level frames every time a pair passes the check, regardless of
    # which pair it was -- confirm this is intended before relying on it.
    COMBINED_HOUSELIST["WHATPUB Site Name"] = PUBMATCH_WHATPUB_SITES["Site Name"]
    return True
# Bare expression with no effect on its own; presumably left over from
# inspecting the frame interactively.
PUBMATCH_WHATPUB_SITES
# NOTE(review): distance_check(...) is called right here, once, with the two
# whole DataFrames, and its return value -- not a function -- is what apply()
# receives. apply(..., axis=1) expects a callable that takes a single row, so
# confirm how the row-by-row comparison is meant to be wired up.
COMBINED_HOUSELIST["Distance Check"] = COMBINED_HOUSELIST.apply(distance_check(PUBMATCH_WHATPUB_SITES, COMBINED_HOUSELIST), axis=1)
Any help would be greatly appreciated, thank you.
EDIT: Example Dataframes
# Sample house list. Every value, coordinates included, is a string because
# the frame is built from an array of strings.
# NOTE(review): the 52.x values sit under 'Longitude' and the -2.x values
# under 'Latitude' -- the two labels look swapped; confirm.
COMBINED_HOUSELIST = pd.DataFrame(
    data=np.array(
        [
            ["12345", "Wrexham Cwtch", "52.10", "-2.06"],
            ["12354", "Horse & Hound", "52.21", "-1.95"],
            ["12435", "Round Of Gras Badsey", "52.33", "-1.99"],
        ]
    ),
    columns=['Site Number', 'Site Name', 'Longitude', 'Latitude'],
)

# Sample sites to match against, with the same column layout.
PUBMATCH_WHATPUB_SITES = pd.DataFrame(
    data=np.array(
        [
            ["52938", "Valkyrie Café Bar", "53.22", "-3.00"],
            ["12435", "Round Of Badsey", "52.33", "-1.99"],
            ["12345", "Cwtch", "52.11", "-2.00"],
        ]
    ),
    columns=['Site Number', 'Site Name', 'Longitude', 'Latitude'],
)
Desired output
# Expected result as posted; all values are strings.
matched_sites = pd.DataFrame(
    data=np.array(
        [
            ["12345", "Wrexham Cwtch", "52.10", "-2.06"],
            ["12354", "Horse & Hound", "52.21", "-1.95"],
            ["12435", "Round Of Gras Badsey", "52.33", "-1.99"],
        ]
    ),
    columns=['Site Number', 'Site Name', 'Longitude', 'Latitude'],
)
One way or another, I fear that you will have to resort to some form of iteration, but doing it outside of Pandas might speed things up.
So, here is one way to do it with the built-in `map` and `functools.partial` from the Python standard library.
First, define two helper functions:
from functools import partial
def calc_distance(coo1, coo2):
    """Return the sum of the absolute differences of the first two components.

    ``coo1`` and ``coo2`` are indexable pairs such as ``[lat, long]``.
    """
    first_gap = abs(coo1[0] - coo2[0])
    second_gap = abs(coo1[1] - coo2[1])
    return first_gap + second_gap
def find_matches(one_list, another_list, threshold):
    """Find, for each coordinate, the nearby coordinates in another list.

    Args:
        one_list: Coordinates to look up, e.g. ``[[lat, long], ...]``.
        another_list: Coordinates to search, in the same format.
        threshold: Largest ``calc_distance`` value that still counts as a match.

    Returns:
        A list with one entry per item of ``one_list``; each entry is the list
        of positions in ``another_list`` whose distance is <= ``threshold``.
    """
    idx = []
    for coo in one_list:
        distance_from_coo = partial(calc_distance, coo)
        # enumerate gives each candidate its own position. Looking positions
        # up with list.index(value) returns the first occurrence, so two
        # candidates at the same distance would both report the first one.
        idx.append(
            [
                position
                for position, distance in enumerate(map(distance_from_coo, another_list))
                if distance <= threshold
            ]
        )
    return idx
Then, with the following toy dataframes:
import pandas as pd
import numpy as np
# Toy house list. Built from an array of strings, so the coordinate columns
# hold strings until they are converted.
# NOTE(review): the 52.x values sit under "Longitude" and the -2.x values
# under "Latitude" -- the two labels look swapped; confirm.
COMBINED_HOUSELIST = pd.DataFrame(
    data=np.array(
        [
            ["12345", "Wrexham Cwtch", "52.10", "-2.06"],
            ["12354", "Horse & Hound", "52.21", "-1.95"],
            ["12435", "Round Of Gras Badsey", "52.33", "-1.99"],
        ]
    ),
    columns=["Site Number", "Site Name", "Longitude", "Latitude"],
)

# Toy list of candidate sites, with the same column layout.
PUBMATCH_WHATPUB_SITES = pd.DataFrame(
    data=np.array(
        [
            ["52938", "Valkyrie Café Bar", "53.22", "-3.00"],
            ["54999", "New Café Bar", "52.10", "-2.1"],
            ["12435", "Round Of Badsey", "52.33", "-1.99"],
            ["12345", "Cwtch", "52.11", "-2.00"],
        ]
    ),
    columns=["Site Number", "Site Name", "Longitude", "Latitude"],
)
You can proceed like this:
# Setup: the toy frames hold coordinates as strings, so convert them to
# numbers before doing arithmetic on them.
for col in ["Latitude", "Longitude"]:
    for df in [COMBINED_HOUSELIST, PUBMATCH_WHATPUB_SITES]:
        df[col] = pd.to_numeric(df[col])

# Get two lists of coordinates looking like [[lat, long], [lat, long],...]
CH_COO = COMBINED_HOUSELIST[["Latitude", "Longitude"]].to_numpy().tolist()
PW_COO = PUBMATCH_WHATPUB_SITES[["Latitude", "Longitude"]].to_numpy().tolist()

# Look for matches: one list of positions in PW_COO per house-list row.
COMBINED_HOUSELIST = COMBINED_HOUSELIST.assign(match=find_matches(CH_COO, PW_COO, 0.1))

# Get site names. find_matches returns list positions, so look them up by
# position (.iloc); label-based .loc only agrees with positions when
# PUBMATCH_WHATPUB_SITES still has its default 0..n-1 index.
COMBINED_HOUSELIST["match"] = COMBINED_HOUSELIST["match"].map(
    lambda positions: PUBMATCH_WHATPUB_SITES["Site Name"].iloc[positions].tolist()
)
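Finally, `print(COMBINED_HOUSELIST)` shows the original columns plus a new `match` column that lists, for each row, the names of the `PUBMATCH_WHATPUB_SITES` entries within the threshold. With the toy data above, the first row matches "New Café Bar" and "Cwtch", the second row matches nothing, and the third row matches "Round Of Badsey".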
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.