I have a data frame of ~17,000 lat/lon values that I wish to use in order to obtain and populate a new column with the equivalent state.
So far, I have tried several solutions (far too many to list here, but more than ten) that were suggested in other Stack Overflow answers but none have worked for me.
The closest I have come to finding a solution is to use the ggmap package, but the problem is that I am warned that I have exceeded the query limit, despite only sending a single lat/lon value to it.
I have individual lat and lon values, and have even combined them into "lat,lon" format too; despite this, none of the aforementioned solutions work for me.
What I want to do is determine the state from a given lat/lon (or coords) value and save the state in a new column (df$state).
I initially matched all city values in order to obtain the matching state, but the problem there was that as several states contain cities with the same name, the matching process stopped after the first successful match; as a result, I found myself with over 2,800 cities belonging to AK despite them being literally several thousand miles away.
Any suggestions would be great.
Here are the first 100 rows of the coords, lat and lon columns of my data:
structure(list(origin_coords = c("31.9618,-83.0588", "44.8782,-69.4718",
"37.3894,-121.8868", "36.0485,-93.5044", "37.652,-120.7292",
"33.7942,-84.2018", "32.0749,-81.0883", "31.0286,-97.6115", "40.7559,-111.8967",
"39.8359,-91.7538", "35.922,-80.537", "39.8036,-75.0058", "43.072,-83.8424",
"33.5207,-86.8025", "26.1216,-80.1288", "31.9618,-83.0588", "31.9618,-83.0588",
"61.6303,-149.8181", "33.8687,-84.3351", "42.2196,-88.2426",
"31.7943,-85.5581", "28.3067,-80.6862", "39.1157,-94.6271", "33.831,-85.7752",
"39.2655,-76.4935", "32.9824,-87.7919", "61.6303,-149.8181",
"31.086,-85.7192", "31.9618,-83.0588", "39.9048,-75.2946", "34.1132,-117.3771",
"41.905,-71.1026", "42.3921,-97.4751", "31.2627,-86.7711", "42.5864,-71.4401",
"33.7935,-93.807", "39.0097,-123.6523", "61.6303,-149.8181",
"37.7235,-85.9769", "38.0624,-87.2452", "37.7166,-121.9226",
"42.9993,-88.2196", "40.6316,-74.0927", "43.0892,-77.436", "39.8359,-91.7538",
"38.5487,-89.5413", "35.833,-90.6965", "41.363,-89.0008", "37.7953,-95.9368",
"33.4581,-83.0802", "33.7546,-93.6735", "32.7491,-96.4598", "41.8858,-87.6181",
"40.7328,-74.0755", "31.2627,-86.7711", "31.9618,-83.0588", "61.6303,-149.8181",
"38.4642,-85.7775", "40.6344,-92.9219", "37.8366,-89.1424", "42.5648,-83.0701",
"39.5394,-76.3564", "33.8687,-84.3351", "41.4564,-90.7235", "42.0122,-87.8417",
"38.8339,-104.8214", "36.4442,-92.5832", "39.838,-104.9988",
"41.8378,-87.7602", "28.3051,-81.4242", "41.6052,-71.9808", "40.7808,-80.0592",
"40.5364,-89.1885", "31.9618,-83.0588", "40.8915,-74.0119", "43.2078,-91.2976",
"34.4574,-83.476", "36.4105,-92.1951", "40.0177,-75.2594", "36.0557,-96.0602",
"44.694,-85.6763", "61.6303,-149.8181", "40.7446,-73.9345", "29.1989,-82.0874",
"26.6048,-80.2149", "34.6909,-118.1491", "39.0289,-95.2086",
"35.4074,-93.1355", "36.2523,-92.6907", "45.2097,-123.2043",
"37.7953,-95.9368", "61.6303,-149.8181", "39.1157,-94.6271",
"33.5793,-86.6375", "40.3757,-86.3201", "40.6344,-92.9219", "39.8359,-91.7538",
"42.3921,-97.4751", "41.2564,-73.2111", "44.2767,-121.1896"),
origin_lat = c(31.9618, 44.8782, 37.3894, 36.0485, 37.652,
33.7942, 32.0749, 31.0286, 40.7559, 39.8359, 35.922, 39.8036,
43.072, 33.5207, 26.1216, 31.9618, 31.9618, 61.6303, 33.8687,
42.2196, 31.7943, 28.3067, 39.1157, 33.831, 39.2655, 32.9824,
61.6303, 31.086, 31.9618, 39.9048, 34.1132, 41.905, 42.3921,
31.2627, 42.5864, 33.7935, 39.0097, 61.6303, 37.7235, 38.0624,
37.7166, 42.9993, 40.6316, 43.0892, 39.8359, 38.5487, 35.833,
41.363, 37.7953, 33.4581, 33.7546, 32.7491, 41.8858, 40.7328,
31.2627, 31.9618, 61.6303, 38.4642, 40.6344, 37.8366, 42.5648,
39.5394, 33.8687, 41.4564, 42.0122, 38.8339, 36.4442, 39.838,
41.8378, 28.3051, 41.6052, 40.7808, 40.5364, 31.9618, 40.8915,
43.2078, 34.4574, 36.4105, 40.0177, 36.0557, 44.694, 61.6303,
40.7446, 29.1989, 26.6048, 34.6909, 39.0289, 35.4074, 36.2523,
45.2097, 37.7953, 61.6303, 39.1157, 33.5793, 40.3757, 40.6344,
39.8359, 42.3921, 41.2564, 44.2767), origin_lon = c(-83.0588,
-69.4718, -121.8868, -93.5044, -120.7292, -84.2018, -81.0883,
-97.6115, -111.8967, -91.7538, -80.537, -75.0058, -83.8424,
-86.8025, -80.1288, -83.0588, -83.0588, -149.8181, -84.3351,
-88.2426, -85.5581, -80.6862, -94.6271, -85.7752, -76.4935,
-87.7919, -149.8181, -85.7192, -83.0588, -75.2946, -117.3771,
-71.1026, -97.4751, -86.7711, -71.4401, -93.807, -123.6523,
-149.8181, -85.9769, -87.2452, -121.9226, -88.2196, -74.0927,
-77.436, -91.7538, -89.5413, -90.6965, -89.0008, -95.9368,
-83.0802, -93.6735, -96.4598, -87.6181, -74.0755, -86.7711,
-83.0588, -149.8181, -85.7775, -92.9219, -89.1424, -83.0701,
-76.3564, -84.3351, -90.7235, -87.8417, -104.8214, -92.5832,
-104.9988, -87.7602, -81.4242, -71.9808, -80.0592, -89.1885,
-83.0588, -74.0119, -91.2976, -83.476, -92.1951, -75.2594,
-96.0602, -85.6763, -149.8181, -73.9345, -82.0874, -80.2149,
-118.1491, -95.2086, -93.1355, -92.6907, -123.2043, -95.9368,
-149.8181, -94.6271, -86.6375, -86.3201, -92.9219, -91.7538,
-97.4751, -73.2111, -121.1896)), row.names = c(NA, 100L), class = "data.frame")
Use the over() function from the sp package:
library(geojsonio)
library(sp)
# Get USA state polygon data as an sp object.
# http://eric.clst.org/tech/usgeojson/
usa <- geojson_read(
  "http://eric.clst.org/assets/wiki/uploads/Stuff/gz_2010_us_040_00_500k.json",
  what = "sp"
)

# Start with a character column so the type is the same whether or not any
# point falls inside a state.
df$state <- NA_character_

# Rows with a missing coordinate cannot be turned into points; they keep NA.
has_coords <- !is.na(df$origin_lon) & !is.na(df$origin_lat)

if (any(has_coords)) {
  # Build all points in one go (x = longitude, y = latitude).
  pts <- sp::SpatialPoints(
    cbind(df$origin_lon[has_coords], df$origin_lat[has_coords])
  )
  # The points must carry the same CRS as the polygons for over() to work.
  sp::proj4string(pts) <- sp::proj4string(usa)

  # over() is vectorised: it returns one row of polygon attributes per point
  # (NA where a point lies outside every polygon), so no per-row loop is
  # needed.
  polygon_check <- sp::over(pts, usa)
  df$state[has_coords] <- as.character(polygon_check$NAME)
}
> head(df)
origin_coords origin_lat origin_lon state
1 31.9618,-83.0588 31.9618 -83.0588 Georgia
2 44.8782,-69.4718 44.8782 -69.4718 Maine
3 37.3894,-121.8868 37.3894 -121.8868 California
4 36.0485,-93.5044 36.0485 -93.5044 Arkansas
5 37.652,-120.7292 37.6520 -120.7292 California
6 33.7942,-84.2018 33.7942 -84.2018 Georgia
Here is an sf solution using a spatial join (st_join) of a spatial object with the US states, states_sf (created from the USAboundaries package), and a spatial object containing your data points, points_sf.
Please verify the results, since I'm pretty new to spatial work in R.
Just filter the result data.frame for the columns you need.
library(sf)
library(USAboundaries)
# State polygons from USAboundaries, transformed to EPSG:4326 so that they
# use the same CRS as the points created below.
states_sf <- st_transform(
  us_states(map_date = NULL, resolution = "low", states = NULL),
  4326
)

# `points` is the data frame holding the origin_lon / origin_lat columns;
# coords are given in (x = longitude, y = latitude) order.
points_sf <- st_as_sf(
  points,
  coords = c("origin_lon", "origin_lat"),
  crs = 4326,
  agr = "constant"
)

# Spatial join: each point picks up the attributes of the state polygon it
# intersects.
result <- as.data.frame(st_join(points_sf, states_sf, join = st_intersects))
# > head(result)
# origin_coords statefp statens affgeoid geoid stusps name lsad aland awater state_name state_abbr jurisdiction_type geometry
# 1 31.9618,-83.0588 13 01705317 0400000US13 13 GA Georgia 00 149169848456 4741100880 Georgia GA state POINT (-83.0588 31.9618)
# 2 44.8782,-69.4718 23 01779787 0400000US23 23 ME Maine 00 79885221885 11748755195 Maine ME state POINT (-69.4718 44.8782)
# 3 37.3894,-121.8868 06 01779778 0400000US06 06 CA California 00 403501101370 20466718403 California CA state POINT (-121.8868 37.3894)
# 4 36.0485,-93.5044 05 00068085 0400000US05 05 AR Arkansas 00 134771517596 2960191698 Arkansas AR state POINT (-93.5044 36.0485)
# 5 37.652,-120.7292 06 01779778 0400000US06 06 CA California 00 403501101370 20466718403 California CA state POINT (-120.7292 37.652)
# 6 33.7942,-84.2018 13 01705317 0400000US13 13 GA Georgia 00 149169848456 4741100880 Georgia GA state POINT (-84.2018 33.7942)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.