簡體   English   中英

使用 spark_apply 計算緯度/經度之間的距離

[英]Computing distances between lat/long using spark_apply

我嘗試使用 spark_apply() 函數,在 R 中計算若干經緯度坐標之間的距離。我可以在 base R 中算出這些距離,但希望改用 spark_apply() 在 Spark 上執行相同的計算。

如何在 spark_apply() 函數中重現 distm(latLong, distanceFrom) 的計算?

數據:

library(data.table)
library(sparklyr)
library(geosphere)
library(tidyverse)

# setup
# Build a Spark configuration and actually hand it to the connection:
# without `config = conf` the setting below is never applied.
conf <- spark_config()
conf$spark.dynamicAllocation.enabled <- "true"
sc <- spark_connect(master = "local", version = "2.3.0", config = conf)

# create data
# tibble() replaces data_frame(), which is deprecated.
df <- tibble(
  place = c("Finland", "Canada", "Tanzania", "Bolivia", "France"),
  longitude = c(27.472918, -90.476303, 34.679950, -65.691146, 4.533465),
  latitude = c(63.293001, 54.239631, -2.855123, -13.795272, 48.603949),
  crs = "+proj=longlat +datum=WGS84"
)

# compute distance from the "distanceFrom" data
# Only the coordinate columns are passed to distm(), longitude first.
latLong <- df %>%
  dplyr::select(c(longitude, latitude))

# Two reference points, one per row, as (longitude, latitude).
distanceFrom <- rbind(c(34.20, -3.67), c(30.56, -2.50))

distm(latLong, distanceFrom)

######################### Apply this in Spark

# Copy the local tibble to Spark under the table name "my_tbl".
mySpark <- sdf_copy_to(sc, df, "my_tbl", overwrite = TRUE)

由於 sparklyr::spark_apply 是在單一的 Spark DataFrame 上運作,一種做法是先透過交叉連接(cross join)把所有資料合併到同一個 Spark DataFrame 中,然後再用 geodist::geodist 計算距離。

library("data.table")
library("sparklyr")
#> 
#> Attaching package: 'sparklyr'
#> The following object is masked from 'package:stats':
#> 
#>     filter
library("geosphere")
library("tidyverse")

# setup
# Build a Spark configuration and pass it to the connection; without
# `config = conf` the dynamic-allocation setting is never applied.
conf <- spark_config()
conf$spark.dynamicAllocation.enabled <- "true"
sc <- spark_connect(master = "local", config = conf)

# create data
# tibble() replaces the deprecated data_frame(), so no deprecation warning is
# emitted. `crs` is a single string recycled to every row.
df <- tibble(
  place = c("Finland", "Canada", "Tanzania", "Bolivia", "France"),
  longitude = c(27.472918, -90.476303, 34.679950, -65.691146, 4.533465),
  latitude = c(63.293001, 54.239631, -2.855123, -13.795272, 48.603949),
  crs = "+proj=longlat +datum=WGS84"
)

df
#> # A tibble: 5 x 4
#>   place    longitude latitude crs                       
#>   <chr>        <dbl>    <dbl> <chr>                     
#> 1 Finland      27.5     63.3  +proj=longlat +datum=WGS84
#> 2 Canada      -90.5     54.2  +proj=longlat +datum=WGS84
#> 3 Tanzania     34.7     -2.86 +proj=longlat +datum=WGS84
#> 4 Bolivia     -65.7    -13.8  +proj=longlat +datum=WGS84
#> 5 France        4.53    48.6  +proj=longlat +datum=WGS84

# Reference result computed locally (no Spark involved).
# The output matrix has one row per point in `latLong` and one column per
# point in `distanceFrom`.
latLong <- dplyr::select(df, c(longitude, latitude))

# Two reference points, one per row, as (longitude, latitude).
distanceFrom <- rbind(
  c(34.20, -3.67),
  c(30.56, -2.50)
)

distm(latLong, distanceFrom)
#>            [,1]       [,2]
#> [1,]  7448355.4  7302060.8
#> [2,] 12520695.4 12197620.9
#> [3,]   104712.2   459812.3
#> [4,] 10987001.5 10626916.8
#> [5,]  6466454.9  6196687.9

# create df_1 from df (5 row dataframe)
# Keep only the coordinate columns.
df_1 <- df %>%
  select(longitude, latitude)

# create df_2 from 'distanceFrom' (2 row matrix)
# Build the tibble column by column with explicit names: calling as_tibble()
# on a matrix without column names triggers a `.name_repair` warning. The
# `_2` suffix keeps the names distinct from those in df_1 once both tables
# are joined.
df_2 <- tibble(
  longitude_2 = distanceFrom[, 1],
  latitude_2 = distanceFrom[, 2]
)
df_2
#> # A tibble: 2 x 2
#>   longitude_2 latitude_2
#>         <dbl>      <dbl>
#> 1        34.2      -3.67
#> 2        30.6      -2.5

# copy both of them to spark
# Each local tibble is piped into copy_to() through the `.` placeholder; the
# result is a reference to a Spark table (printed as `Source: spark<?>`).
# NOTE(review): no explicit `name` is supplied, so the Spark table name is
# derived from the `.` placeholder — confirm the two copies cannot collide.
df_1_sdf = df_1 %>% 
    copy_to(sc, ., overwrite = TRUE)

df_1_sdf
#> # Source: spark<?> [?? x 2]
#>   longitude latitude
#>       <dbl>    <dbl>
#> 1     27.5     63.3 
#> 2    -90.5     54.2 
#> 3     34.7     -2.86
#> 4    -65.7    -13.8 
#> 5      4.53    48.6

# Same copy for the two reference points.
df_2_sdf = df_2 %>% 
    copy_to(sc, ., overwrite = TRUE)

df_2_sdf
#> # Source: spark<?> [?? x 2]
#>   longitude_2 latitude_2
#>         <dbl>      <dbl>
#> 1        34.2      -3.67
#> 2        30.6      -2.5

# Distance function for spark_apply(), built on the geodist package.
#
# x: a data frame holding both points of each pair in the columns
#    `longitude`/`latitude` and `longitude_2`/`latitude_2`.
# Returns `x` with an added `distance` column containing what
# geodist::geodist() yields for each row (`paired = TRUE`,
# `measure = "geodesic"`).
get_geodesic_distance <- function(x) {
  first_points <- dplyr::select(x, c(latitude, longitude))
  second_points <- dplyr::select(x, c(latitude_2, longitude_2))

  # paired = TRUE: row i of the first set is matched with row i of the second.
  pair_distance <- geodist::geodist(
    first_points,
    second_points,
    paired = TRUE,
    measure = "geodesic"
  )

  dplyr::mutate(x, distance = pair_distance)
}

# Build every (df_1 row, df_2 row) combination, then compute the distances.
# The join uses an empty `by`, so no key columns are matched; glimpse() prints
# the joined table and passes it on to spark_apply().
df_1_sdf %>%
  full_join(df_2_sdf, by = character(0)) %>%
  glimpse() %>%
  spark_apply(get_geodesic_distance)
#> Rows: ??
#> Columns: 4
#> Database: spark_connection
#> $ longitude   <dbl> 27.472918, 27.472918, -90.476303, -90.476303, 34.679950, 3…
#> $ latitude    <dbl> 63.293001, 63.293001, 54.239631, 54.239631, -2.855123, -2.…
#> $ longitude_2 <dbl> 34.20, 30.56, 34.20, 30.56, 34.20, 30.56, 34.20, 30.56, 34…
#> $ latitude_2  <dbl> -3.67, -2.50, -3.67, -2.50, -3.67, -2.50, -3.67, -2.50, -3…
#> # Source: spark<?> [?? x 5]
#>    longitude latitude longitude_2 latitude_2  distance
#>        <dbl>    <dbl>       <dbl>      <dbl>     <dbl>
#>  1     27.5     63.3         34.2      -3.67  7448355.
#>  2     27.5     63.3         30.6      -2.5   7302061.
#>  3    -90.5     54.2         34.2      -3.67 12520695.
#>  4    -90.5     54.2         30.6      -2.5  12197621.
#>  5     34.7     -2.86        34.2      -3.67   104712.
#>  6     34.7     -2.86        30.6      -2.5    459812.
#>  7    -65.7    -13.8         34.2      -3.67 10987002.
#>  8    -65.7    -13.8         30.6      -2.5  10626917.
#>  9      4.53    48.6         34.2      -3.67  6466455.
#> 10      4.53    48.6         30.6      -2.5   6196688.

PS:考慮geospark package 在 spark 上進行地理空間工作。

這是不依賴 geodist 套件、可供 spark_apply() 使用的函數(以 haversine 公式計算距離)。

步驟與上面的答案相同,但要先定義下面的函數,再照常調用:

... %>% 
spark_apply(get_geodesic_distance)

請修改函數中的 geolocation(x$long1, x$lat1, x$long2, x$lat2) 調用,使其對應您資料中的實際列名(例如上例中的 longitude、latitude、longitude_2、latitude_2)。

# Append a great-circle (haversine) distance column to a data frame.
#
# Intended to be passed to spark_apply(): it takes one data frame and returns
# it with one extra numeric column. Only base R is used, so no extra packages
# are needed where the function runs.
#
# x: data frame with numeric columns `long1`, `lat1`, `long2`, `lat2`, all in
#    decimal degrees. If your columns are named differently, rename them or
#    adjust the geolocation() call below.
# Returns: `x` with an added `distance` column, in metres (the radius below is
#    given in metres).
# Stops with an error if any of the four required columns is missing.
get_geodesic_distance <- function(x) {

  # Haversine distance between (long1, lat1) and (long2, lat2); vectorised.
  geolocation <- function(long1, lat1, long2, lat2) {

    deg2rad <- function(deg) deg * pi / 180

    # Convert degrees to radians
    long1 <- deg2rad(long1)
    lat1 <- deg2rad(lat1)
    long2 <- deg2rad(long2)
    lat2 <- deg2rad(lat2)

    # Radius in metres: 6378137 m is the WGS84 equatorial radius
    # (the mean Earth radius would be about 6371 km).
    earth_radius <- 6378137

    diff_long <- long2 - long1
    diff_lat <- lat2 - lat1

    a <- sin(diff_lat / 2)^2 +
      cos(lat1) * cos(lat2) * sin(diff_long / 2)^2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) return NaN.
    a <- pmin(pmax(a, 0), 1)
    central_angle <- 2 * atan2(sqrt(a), sqrt(1 - a))

    # Distance in metres
    earth_radius * central_angle
  }

  # Fail early with a readable message instead of an obscure length error
  # when the expected columns are absent.
  required <- c("long1", "lat1", "long2", "lat2")
  missing_cols <- setdiff(required, names(x))
  if (length(missing_cols) > 0) {
    stop(
      "get_geodesic_distance(): missing column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  dist_vec <- geolocation(x$long1, x$lat1, x$long2, x$lat2)

  x$distance <- dist_vec
  x
}

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM