[英]Python - Distance matrix between geographic coordinates
我有一個 pandas DataFrame,其中包含 600 多個地理坐標點。以下是其中的一部分:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from math import sin, cos, sqrt, atan2, radians
# Sample of the coordinate table: one row per point, values in degrees.
lat_long = pd.DataFrame(
    {
        'LATITUDE': [-22.98, -22.97, -22.92, -22.87, -22.89],
        'LONGITUDE': [-43.19, -43.39, -43.24, -43.28, -43.67],
    }
)
lat_long
要手動計算兩點之間的距離,我使用以下代碼:
# Haversine distance between the first two rows (labels 0 and 1) of lat_long.
R = 6373.0  # radius of the sphere used for the computation

# Convert both points from degrees to radians.
lat1, lon1 = (radians(lat_long[col][0]) for col in ('LATITUDE', 'LONGITUDE'))
lat2, lon2 = (radians(lat_long[col][1]) for col in ('LATITUDE', 'LONGITUDE'))

dlat = lat2 - lat1
dlon = lon2 - lon1

# a: haversine of the central angle, c: the central angle itself.
a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
c = 2 * atan2(sqrt(a), sqrt(1 - a))

distance = R * c
print("Result:", round(distance,4))
我需要創建一個函數,使用上面的公式計算所有點兩兩之間的距離,並以矩陣(二維數組)的形式存放結果。但我想不出這個函數該怎麼寫,以及該如何存儲各點之間的距離。歡迎任何幫助。輸出示例(僅作說明用,以免我沒有講清楚):
| |point 0 | point1 | point2 |
|point0 | 0 | 2 | 3 |
|point1 | 2 | 0 | 4 |
|point2 | 3 | 4 | 0 |
|distance|distance|distance|
您可以使用pdist計算成對距離:
import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians
from scipy.spatial.distance import pdist, squareform
# Five sample points as (latitude, longitude) rows, in degrees.
lat_long = pd.DataFrame(
    [
        (-22.98, -43.19),
        (-22.97, -43.39),
        (-22.92, -43.24),
        (-22.87, -43.28),
        (-22.89, -43.67),
    ],
    columns=['LATITUDE', 'LONGITUDE'],
)
def dist(x, y):
    """Return the haversine distance between two points, rounded to 4 decimals.

    Args:
        x: First point as a (latitude, longitude) pair in degrees.
        y: Second point as a (latitude, longitude) pair in degrees.

    Returns:
        The great-circle distance on a sphere of radius 6373.0, rounded to
        four decimal places.
    """
    lat1 = radians(x[0])
    lon1 = radians(x[1])
    lat2 = radians(y[0])
    lon2 = radians(y[1])
    R = 6373.0  # sphere radius used for the result
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for antipodal
    # points, which would make sqrt(1 - a) raise ValueError; clamp to [0, 1].
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return round(R * c, 4)
# Condensed vector of pairwise distances between the rows of lat_long,
# evaluated with the custom metric `dist`.
distances = pdist(lat_long.values, metric=dist)

# Row/column labels are 1-based: point_1 ... point_N.
points = ['point_{}'.format(i + 1) for i in range(len(lat_long))]

# Expand the condensed vector into a square, labelled table.
result = pd.DataFrame(
    data=squareform(distances),
    index=points,
    columns=points,
)
print(result)
Output
point_1 point_2 point_3 point_4 point_5
point_1 0.0000 20.5115 8.4123 15.3203 50.1784
point_2 20.5115 0.0000 16.3400 15.8341 30.0319
point_3 8.4123 16.3400 0.0000 6.9086 44.1838
point_4 15.3203 15.8341 6.9086 0.0000 40.0284
point_5 50.1784 30.0319 44.1838 40.0284 0.0000
請注意,squareform 會把 pdist 返回的壓縮距離向量(condensed distance vector)轉換為完整的對稱方陣,因此結果存儲在一個二維 numpy 數組中。
另一種可能的解決方案是
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from math import sin, cos, sqrt, atan2, radians
# Five sample points; coordinates are in degrees.
lat_long = pd.DataFrame(
    dict(
        LATITUDE=[-22.98, -22.97, -22.92, -22.87, -22.89],
        LONGITUDE=[-43.19, -43.39, -43.24, -43.28, -43.67],
    )
)
lat_long
# Rows from position 2 onwards, all columns.
test = lat_long.iloc[2:]
def distance(city1, city2):
    """Return the haversine distance between two points.

    Args:
        city1: Mapping-like object (e.g. a DataFrame row) exposing
            'LATITUDE' and 'LONGITUDE' in degrees.
        city2: Second point, same layout as ``city1``.

    Returns:
        The great-circle distance on a sphere of radius 6373.0 (unrounded).
    """
    lat1 = radians(city1['LATITUDE'])
    lon1 = radians(city1['LONGITUDE'])
    lat2 = radians(city2['LATITUDE'])
    lon2 = radians(city2['LONGITUDE'])
    R = 6373.0  # sphere radius used for the result
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally above 1 for antipodal
    # points, which would make sqrt(1 - a) raise ValueError; clamp to [0, 1].
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    # Return the value directly instead of binding a local that shadows the
    # function's own name.
    return R * c
# Fill the upper triangle of the distance matrix; the diagonal and the lower
# triangle are left at 0.
n_points = lat_long.shape[0]
dist = np.zeros((n_points, n_points))
# Iterate by position rather than by index label, so the result does not
# depend on lat_long having a default 0..N-1 index.
for i1 in range(n_points):
    city1 = lat_long.iloc[i1]
    for i2 in range(i1 + 1, n_points):
        dist[i1, i2] = distance(city1, lat_long.iloc[i2])
print(dist)
Output
[[ 0. 20.51149047 8.41230771 15.32026132 50.17836849]
[ 0. 0. 16.33997119 15.83407186 30.03192954]
[ 0. 0. 0. 6.90864606 44.18376436]
[ 0. 0. 0. 0. 40.02842872]
[ 0. 0. 0. 0. 0. ]]
距離矩陣的下三角部分保持為 0(未填充),因為矩陣是對稱的(dist[i1,i2] == dist[i2,i1]);如需完整矩陣,可用 dist + dist.T 得到。
下面這種向量化的做法似乎比 pdist 方案快了大約一倍:
# imports
import pandas as pd
import numpy as np
# supporting functions
def create_cartestin(df: pd.DataFrame):
    """
    This function returns cartesian of a dataframe with itself.

    The input dataframe is not modified: the helper columns are added to a
    copy.

    df:
        dataframe to combine with itself
    returns:
        dataframe with one row per ordered pair of rows of ``df``; each
        original column appears twice, suffixed ``_x`` (left row) and ``_y``
        (right row), along with ``temp_id_x`` / ``temp_id_y`` holding the
        positional ids of the two rows
    """
    # assign() returns a new frame, so the caller's dataframe does not gain
    # the artificial id / merge-key columns as a side effect
    keyed = df.assign(
        # artificial id: position of the row in df
        temp_id=list(range(len(df))),
        # constant key, so the merge pairs every row with every row
        temp_key=1,
    )
    df_cartesian = keyed.merge(keyed, on=['temp_key']).drop(columns=['temp_key'])
    return df_cartesian
def hav(theta):
    """Return sin(theta / 2) ** 2 (the haversine of an angle in radians)."""
    return np.sin(theta/2)**2


def give_me_straight_line_distance(
    df: pd.DataFrame,
    lattitude_x: str,
    longitude_x: str,
    lattitude_y: str,
    longitude_y: str
):
    """
    This function calculates distance between coordinates with haversine formula.

    df:
        dataframe containing cartesian product of tables containing points of interest
    lattitude_x:
        name of column containing lattitude of points x (1st set of points of interest)
    longitude_x:
        name of column containing longitude of points x (1st set of points of interest)
    lattitude_y:
        name of column containing lattitude of points y (2nd set of points of interest)
    longitude_y:
        name of column containing longitude of points y (2nd set of points of interest)
    returns:
        1-D numpy array with one distance per row of ``df``, computed on a
        sphere of radius 6371.009; the coordinate columns are read as degrees
    """
    # assumed Earth radius
    r = 6371.009
    coords = df[[lattitude_x, longitude_x, lattitude_y, longitude_y]].values
    coordinates = np.deg2rad(coords)
    lat1 = coordinates[:, 0]
    lng1 = coordinates[:, 1]
    lat2 = coordinates[:, 2]
    lng2 = coordinates[:, 3]
    t = hav(lat2 - lat1) + np.cos(lat1) * np.cos(lat2) * hav(lng2 - lng1)
    # Rounding can push t marginally above 1 for antipodal points, which would
    # turn arcsin(sqrt(t)) into NaN; keep it inside [0, 1].
    t = np.clip(t, 0.0, 1.0)
    d = 2 * r * np.arcsin(np.sqrt(t))
    return d
# THE FUNCTION
def give_me_distance_matrix(df: pd.DataFrame):
    """
    Build the all-pairs distance matrix for the points in a dataframe.

    df:
        dataframe of points; the code below relies on it providing
        'LATITUDE' and 'LONGITUDE' columns
    returns:
        square dataframe with default integer row and column labels, where
        the entry at row i, column j is the distance computed for the pair
        (point j, point i)
    """
    # every ordered pair of points, one pair per row
    pairs = create_cartestin(df)

    # distance for each pair
    pairs['distance_km'] = give_me_straight_line_distance(
        df=pairs,
        lattitude_x='LATITUDE_x',
        longitude_x='LONGITUDE_x',
        lattitude_y='LATITUDE_y',
        longitude_y='LONGITUDE_y',
    )

    # long -> wide: rows keyed by temp_id_x, columns keyed by temp_id_y
    indexed = pairs.set_index(['temp_id_x', 'temp_id_y'])
    wide = indexed[['distance_km']].unstack(['temp_id_y'])

    # drop the artificial labels on both axes
    wide = wide.reset_index(drop=True)
    matrix = wide.T.reset_index(drop=True)
    return matrix
# Benchmark input: the five sample points repeated 100 times.
lat_long = pd.DataFrame(
    {
        'LATITUDE': 100 * [-22.98, -22.97, -22.92, -22.87, -22.89],
        'LONGITUDE': 100 * [-43.19, -43.39, -43.24, -43.28, -43.67],
    }
)
lat_long.shape
(500, 2)
%%timeit
result = give_me_distance_matrix(lat_long)
244 ms ± 33.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
作為對比,下面是 pdist 方案在同一份數據上的耗時:
import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians
from scipy.spatial.distance import pdist, squareform
def dist(x, y):
    """Function to compute the distance between two points x, y.

    Args:
        x: First point as a (latitude, longitude) pair in degrees.
        y: Second point as a (latitude, longitude) pair in degrees.

    Returns:
        The haversine distance on a sphere of radius 6373.0, rounded to four
        decimal places.
    """
    lat1, lon1 = radians(x[0]), radians(x[1])
    lat2, lon2 = radians(y[0]), radians(y[1])
    R = 6373.0  # sphere radius used for the result
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Guard against rounding error: for antipodal points `a` can land
    # marginally above 1, and sqrt(1 - a) would then raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return round(R * c, 4)
%%timeit
# Condensed pairwise distances between the rows of lat_long using `dist`.
distances = pdist(lat_long.values, metric=dist)
# 1-based labels: point_1 ... point_N.
points = ['point_{}'.format(i + 1) for i in range(len(lat_long))]
# Square, labelled distance table.
result = pd.DataFrame(data=squareform(distances), index=points, columns=points)
563 ms ± 77.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.