繁体   English   中英

Python - 地理坐标之间的距离矩阵

[英]Python - Distance matrix between geographic coordinates

我有一个包含 600 多个地理坐标点的 pandas DataFrame。其摘录如下:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from math import sin, cos, sqrt, atan2, radians

# Five sample points, one column per coordinate (values are in degrees:
# the snippet below passes them through ``radians`` before use).
lat_long = pd.DataFrame(
    {
        'LATITUDE': [-22.98, -22.97, -22.92, -22.87, -22.89],
        'LONGITUDE': [-43.19, -43.39, -43.24, -43.28, -43.67],
    }
)
lat_long

要手动计算两点之间的距离,我使用以下代码:

# Haversine distance between rows 0 and 1 of ``lat_long``.
R = 6373.0  # sphere radius; the result is expressed in the same unit

# Degrees -> radians before any trigonometry.
lat1, lon1 = radians(lat_long['LATITUDE'][0]), radians(lat_long['LONGITUDE'][0])
lat2, lon2 = radians(lat_long['LATITUDE'][1]), radians(lat_long['LONGITUDE'][1])

dlon, dlat = lon2 - lon1, lat2 - lat1

# ``a`` is the haversine of the central angle, ``c`` the central angle itself.
a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
c = 2 * atan2(sqrt(a), sqrt(1 - a))
distance = R * c

print("Result:", round(distance,4))

我需要创建一个函数,使用上面的公式计算所有点两两之间的距离,并以矩阵的形式呈现。但我不清楚该如何编写这个函数,以及如何存储各点之间的距离。欢迎任何帮助。输出示例(仅用于说明,以防我表述不清):

|       |point 0 | point1 | point2 |
|point0 |    0   |    2   |   3    |
|point1 |    2   |    0   |   4    |
|point2 |    3   |    4   |   0    |
        |distance|distance|distance|

您可以使用pdist计算成对距离:

import pandas as pd

import numpy as np
from math import sin, cos, sqrt, atan2, radians

from scipy.spatial.distance import pdist, squareform

# Sample coordinates: row i is point i, columns are (LATITUDE, LONGITUDE).
lat_long = pd.DataFrame(
    {
        'LATITUDE': [-22.98, -22.97, -22.92, -22.87, -22.89],
        'LONGITUDE': [-43.19, -43.39, -43.24, -43.28, -43.67],
    }
)


def dist(x, y):
    """Return the haversine distance between two points, rounded to 4 decimals.

    Args:
        x: Indexable pair ``(latitude, longitude)`` in degrees.
        y: Indexable pair ``(latitude, longitude)`` in degrees.

    Returns:
        The great-circle distance on a sphere of radius 6373.0, in the unit of
        that radius (presumably kilometres), rounded to four decimal places.
    """
    lat1, lon1 = radians(x[0]), radians(x[1])
    lat2, lon2 = radians(y[0]), radians(y[1])

    R = 6373.0

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push ``a`` a hair above 1 for (near-)antipodal
    # points, which would make ``sqrt(1 - a)`` raise ValueError; cap it at 1.
    a = min(1.0, a)
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return round(R * c, 4)


# Condensed vector of pairwise distances, computed with the custom ``dist`` metric.
distances = pdist(lat_long.values, metric=dist)

# One label per row of ``lat_long``, numbered from 1.
points = [f'point_{i}' for i, _ in enumerate(lat_long.index, start=1)]

# Expand the condensed vector into a labelled square matrix.
result = pd.DataFrame(data=squareform(distances), index=points, columns=points)

print(result)

Output

         point_1  point_2  point_3  point_4  point_5
point_1   0.0000  20.5115   8.4123  15.3203  50.1784
point_2  20.5115   0.0000  16.3400  15.8341  30.0319
point_3   8.4123  16.3400   0.0000   6.9086  44.1838
point_4  15.3203  15.8341   6.9086   0.0000  40.0284
point_5  50.1784  30.0319  44.1838  40.0284   0.0000

请注意,squareform 会把 pdist 返回的压缩距离向量(condensed distance vector)转换为完整的方阵,因此结果存储在 numpy 数组中。

另一种可能的解决方案是

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from math import sin, cos, sqrt, atan2, radians

# Sample coordinates: one row per point.
lat_long = pd.DataFrame(
    {
        'LATITUDE': [-22.98, -22.97, -22.92, -22.87, -22.89],
        'LONGITUDE': [-43.19, -43.39, -43.24, -43.28, -43.67],
    }
)
lat_long

# Rows from position 2 onward (not referenced again in this snippet).
test = lat_long.iloc[2:]

def distance(city1, city2):
    """Return the haversine distance between two points.

    Args:
        city1: Mapping-like object (e.g. a DataFrame row) with ``'LATITUDE'``
            and ``'LONGITUDE'`` entries in degrees.
        city2: Same layout as ``city1``.

    Returns:
        The unrounded great-circle distance on a sphere of radius 6373.0, in
        the unit of that radius (presumably kilometres).
    """
    lat1 = radians(city1['LATITUDE'])
    lon1 = radians(city1['LONGITUDE'])
    lat2 = radians(city2['LATITUDE'])
    lon2 = radians(city2['LONGITUDE'])

    R = 6373.0

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Floating-point rounding can push ``a`` a hair above 1 for (near-)antipodal
    # points, which would make ``sqrt(1 - a)`` raise ValueError; cap it at 1.
    a = min(1.0, a)
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    # Returned directly: a local named ``distance`` would shadow this function.
    return R * c

# Square matrix of zeros; only entries above the diagonal are filled in below.
n_points = lat_long.shape[0]
dist = np.zeros((n_points, n_points))

for row, origin in lat_long.iterrows():
    # Pair each point only with the points after it, so every pair is computed once.
    # NOTE(review): ``row`` is an index label used as a position here, which
    # relies on ``lat_long`` having the default 0..n-1 index.
    later_points = lat_long.iloc[row + 1:, :]
    for col, target in later_points.iterrows():
        dist[row, col] = distance(origin, target)

print(dist)

Output

[[ 0.         20.51149047  8.41230771 15.32026132 50.17836849]
 [ 0.          0.         16.33997119 15.83407186 30.03192954]
 [ 0.          0.          0.          6.90864606 44.18376436]
 [ 0.          0.          0.          0.         40.02842872]
 [ 0.          0.          0.          0.          0.        ]]

距离矩阵的下三角部分为零(未填充),因为矩阵是对称的(dist[i1,i2] == dist[i2,i1])。

这似乎快了两倍:

# imports
import pandas as pd
import numpy as np

# supporting functions
def create_cartestin(df: pd.DataFrame):
    """
    This function returns the cartesian product of a dataframe with itself.

    df:
        dataframe to combine with itself; it is left unmodified

    Returns a dataframe with ``len(df) ** 2`` rows in which every column of
    ``df`` appears twice, suffixed ``_x`` (left row) and ``_y`` (right row),
    plus ``temp_id_x`` / ``temp_id_y`` holding each row's 0-based position
    in ``df``.

    """

    # ``assign`` returns a new frame, so the helper columns (artificial id and
    # constant merge key) never leak into the caller's dataframe
    keyed = df.assign(temp_id=np.arange(len(df)), temp_key=1)

    # every row carries the same key, so merging on it pairs each row with every row
    df_cartesian = keyed.merge(keyed, on=['temp_key']).drop(columns=['temp_key'])

    return df_cartesian

def hav(theta):
    """Return the haversine of ``theta``, i.e. ``sin(theta / 2) ** 2``.

    ``theta`` is passed to ``np.sin``, so a NumPy array is evaluated
    element-wise.
    """
    half_angle_sine = np.sin(theta / 2)
    return half_angle_sine ** 2

def give_me_straight_line_distance(
    df: pd.DataFrame,
    lattitude_x: str,
    longitude_x: str,
    lattitude_y: str,
    longitude_y: str
):
    """
    This function calculates distance between coordinates with haversine formula.

    df:
        dataframe containing cartesian product of tables containing points of interest
    lattitude_x:
        name of column containing latitude of points x (1st set of points of interest)
    longitude_x:
        name of column containing longitude of points x (1st set of points of interest)
    lattitude_y:
        name of column containing latitude of points y (2nd set of points of interest)
    longitude_y:
        name of column containing longitude of points y (2nd set of points of interest)

    Coordinates are read in degrees. Returns a 1-D numpy array with one
    distance per row of ``df``, in the unit of the radius below (presumably
    kilometres).

    """

    # assumed Earth radius
    r = 6371.009

    coords = df[[lattitude_x, longitude_x, lattitude_y, longitude_y]].values
    # one row per coordinate column after the transpose
    lat1, lng1, lat2, lng2 = np.deg2rad(coords).T

    t = hav(lat2 - lat1) + np.cos(lat1) * np.cos(lat2) * hav(lng2 - lng1)
    # rounding can push t marginally outside [0, 1] (e.g. for antipodal points),
    # which would turn arcsin(sqrt(t)) into NaN; clip it back into the domain
    t = np.clip(t, 0.0, 1.0)
    d = 2 * r * np.arcsin(np.sqrt(t))

    return d

# THE FUNCTION
def give_me_distance_matrix(df: pd.DataFrame):
    """
    Build the all-pairs distance matrix for the points in ``df``.

    df:
        dataframe with ``LATITUDE`` and ``LONGITUDE`` columns

    Returns a square dataframe with plain 0..n-1 row and column labels. Because
    of the final transpose, rows follow ``temp_id_y`` and columns follow
    ``temp_id_x``.
    """
    # every point paired with every point
    pairs = create_cartestin(df)

    # one distance per pair
    pairs['distance_km'] = give_me_straight_line_distance(
        df=pairs,
        lattitude_x='LATITUDE_x',
        longitude_x='LONGITUDE_x',
        lattitude_y='LATITUDE_y',
        longitude_y='LONGITUDE_y',
    )

    # long format -> wide format: one row per temp_id_x, one column per temp_id_y
    indexed = pairs.set_index(['temp_id_x', 'temp_id_y'])
    matrix = indexed[['distance_km']].unstack(['temp_id_y'])

    # strip the helper labels from the rows, then (via the transpose) from the columns
    matrix = matrix.reset_index(drop=True)
    return matrix.T.reset_index(drop=True)

# The five sample points repeated 100 times, giving the frame used for timing.
lat_long = pd.DataFrame(
    {
        'LATITUDE': [-22.98, -22.97, -22.92, -22.87, -22.89] * 100,
        'LONGITUDE': [-43.19, -43.39, -43.24, -43.28, -43.67] * 100,
    }
)
lat_long.shape
(500, 2)

%%timeit
# Build the full pairwise distance matrix for the 500-row ``lat_long`` frame.
result = give_me_distance_matrix(lat_long)

244 ms ± 33.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

比:

import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians
from scipy.spatial.distance import pdist, squareform

def dist(x, y):
    """Return the haversine distance between two points, rounded to 4 decimals.

    Args:
        x: Indexable pair ``(latitude, longitude)`` in degrees.
        y: Indexable pair ``(latitude, longitude)`` in degrees.

    Returns:
        The great-circle distance on a sphere of radius 6373.0, in the unit of
        that radius (presumably kilometres), rounded to four decimal places.
    """
    lat1, lon1 = radians(x[0]), radians(x[1])
    lat2, lon2 = radians(y[0]), radians(y[1])

    R = 6373.0

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push ``a`` a hair above 1 for (near-)antipodal
    # points, which would make ``sqrt(1 - a)`` raise ValueError; cap it at 1.
    a = min(1.0, a)
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return round(R * c, 4)

%%timeit
# Condensed pairwise distances via the custom ``dist`` metric.
distances = pdist(lat_long.values, metric=dist)
# One label per row of ``lat_long``, numbered from 1.
points = [f'point_{i}' for i, _ in enumerate(lat_long.index, start=1)]
# Expand to a labelled square matrix.
result = pd.DataFrame(data=squareform(distances), index=points, columns=points)

563 ms ± 77.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM