简体   繁体   English

将不等长的嵌套JSON转换为R中的数据帧

[英]Converting nested JSON of unequal lengths to data-frame in R

I have a JSON file 'data.json' which contains information about different places of interest. 我有一个JSON文件“ data.json”,其中包含有关不同景点的信息。

data = lapply(readLines("data.json"), fromJSON)

This creates a nested list having different lengths. 这将创建一个具有不同长度的嵌套列表。 Here's a sample of the first 4 lines. 这是前4行的示例。

list(structure(list(payload = structure(list(existence_full = 1L, 
    geo_virtual = "[\"56.9459720|-2.1971226|20|within_50m|4\"]", 
    latitude = "56.945972", locality = "Stonehaven", `_records_touched` = "{\"crawl\":8,\"lssi\":0,\"polygon_centroid\":0,\"geocoder\":0,\"user_submission\":0,\"tdc\":0,\"gov\":0}", 
    address = "The Lodge, Dunottar", email = "dunnottarcastle@btconnect.com", 
    existence_ml = 0.569423821765872, domain_aggregate = "", 
    name = "Dunnottar Castle", search_tags = c("Dunnottar Castle Aberdeenshire", 
    "Dunotter Castle"), admin_region = "Scotland", existence = 1L, 
    category_labels = structure(c("Landmarks", "Buildings and Structures"
    ), .Dim = 1:2), post_town = "Stonehaven", region = "Kincardineshire", 
    review_count = "719", geocode_level = "within_50m", tel = "01569 762173", 
    placerank = 65L, longitude = "-2.197123", placerank_ml = 37.2791607346447, 
    fax = "01330 860325", category_ids_text_search = "", website = "http://www.dunnottarcastle.co.uk", 
    status = "1", geocode_confidence = "20", postcode = "AB39 2TL", 
    category_ids = 108L, country = "gb", `_geocode_quality` = "4"), .Names = c("existence_full", 
"geo_virtual", "latitude", "locality", "_records_touched", "address", 
"email", "existence_ml", "domain_aggregate", "name", "search_tags", 
"admin_region", "existence", "category_labels", "post_town", 
"region", "review_count", "geocode_level", "tel", "placerank", 
"longitude", "placerank_ml", "fax", "category_ids_text_search", 
"website", "status", "geocode_confidence", "postcode", "category_ids", 
"country", "_geocode_quality")), uuid = "3867aaf3-12ab-434f-b12b-5d627b3359c3"), .Names = c("payload", 
"uuid")), structure(list(payload = structure(list(existence_full = 1L, 
    geo_virtual = "[\"56.237480|-5.073578|20|within_50m|4\"]", 
    latitude = "56.237480", locality = "Inveraray", `_records_touched` = "{\"crawl\":11,\"lssi\":0,\"polygon_centroid\":0,\"geocoder\":0,\"user_submission\":0,\"tdc\":0,\"gov\":0}", 
    address = "Cherry Park", email = "enquiries@inveraray-castle.com", 
    longitude = "-5.073578", domain_aggregate = "", name = "Inveraray Castle", 
    admin_region = "Scotland", search_tags = c("Inveraray Castle Tea Room", 
    "Inverary Castle"), existence = 1L, category_labels = structure(c("Social", 
    "Food and Dining", "Restaurants"), .Dim = c(1L, 3L)), region = "Argyll", 
    review_count = "532", geocode_level = "within_50m", tel = "01499 302203", 
    placerank = 67L, post_town = "Inveraray", placerank_ml = 41.1997808735227, 
    fax = "01499 302421", category_ids_text_search = "", website = "http://www.inveraray-castle.com", 
    status = "1", geocode_confidence = "20", postcode = "PA32 8XE", 
    category_ids = 347L, country = "gb", `_geocode_quality` = "4", 
    existence_ml = 0.791488110284778), .Names = c("existence_full", 
"geo_virtual", "latitude", "locality", "_records_touched", "address", 
"email", "longitude", "domain_aggregate", "name", "admin_region", 
"search_tags", "existence", "category_labels", "region", "review_count", 
"geocode_level", "tel", "placerank", "post_town", "placerank_ml", 
"fax", "category_ids_text_search", "website", "status", "geocode_confidence", 
"postcode", "category_ids", "country", "_geocode_quality", "existence_ml"
)), uuid = "8278ab80-2cd1-4dbd-9685-0d0036b681eb"), .Names = c("payload", 
"uuid")), structure(list(payload = structure(list(existence_full = 1L, 
    geo_virtual = "[\"51.483872|-0.606820|100|rooftop|2\"]", 
    latitude = "51.483872", locality = "Windsor Castle", hours_display = "Mon-Sat 11:30 AM-11:00 PM; Sun 12:00 PM-11:00 PM", 
    `_records_touched` = "{\"crawl\":7,\"lssi\":0,\"polygon_centroid\":0,\"geocoder\":2,\"user_submission\":0,\"tdc\":0,\"gov\":0}", 
    address = "", longitude = "-0.606820", domain_aggregate = "", 
    name = "Windsor Castle", admin_region = "England", search_tags = c("The Windsor Castle", 
    "The Windsor Castle Pub", "The Windsor Castle Public House", 
    "Pub Food", "British"), existence = 1L, category_labels = structure(c("Landmarks", 
    "Buildings and Structures"), .Dim = 1:2), region = "Berkshire", 
    review_count = "", geocode_level = "rooftop", tel = "020 7766 7304", 
    placerank = 62L, post_town = "Windsor", placerank_ml = 28.1160845346327, 
    fax = "01753 832290", category_ids_text_search = "", website = "http://www.royalcollection.org.uk/visit/windsorcastle", 
    status = "1", hours = "{\"monday\":[[\"11:30\",\"23:00\"]],\"tuesday\":[[\"11:30\",\"23:00\"]],\"wednesday\":[[\"11:30\",\"23:00\"]],\"thursday\":[[\"11:30\",\"23:00\"]],\"friday\":[[\"11:30\",\"23:00\"]],\"saturday\":[[\"11:30\",\"23:00\"]],\"sunday\":[[\"12:00\",\"23:00\"]]}", 
    neighborhood = "Chalvey", geocode_confidence = "100", postcode = "SL4 1NJ", 
    category_ids = 108L, country = "gb", `_geocode_quality` = "2", 
    existence_ml = 0.885705196944165, email = "bookinginfo@royalcollection.org.uk"), .Names = c("existence_full", 
"geo_virtual", "latitude", "locality", "hours_display", "_records_touched", 
"address", "longitude", "domain_aggregate", "name", "admin_region", 
"search_tags", "existence", "category_labels", "region", "review_count", 
"geocode_level", "tel", "placerank", "post_town", "placerank_ml", 
"fax", "category_ids_text_search", "website", "status", "hours", 
"neighborhood", "geocode_confidence", "postcode", "category_ids", 
"country", "_geocode_quality", "existence_ml", "email")), uuid = "c5f7d8a9-0851-46ef-8da7-ad55e187d3a8"), .Names = c("payload", 
"uuid")), structure(list(payload = structure(list(existence_full = 1L, 
    category_ids_text_search = "", placerank_ml = 31.9857184762157, 
    longitude = "-2.191955", name = "Pitmedden Garden", domain_aggregate = "", 
    admin_region = "Scotland", languages = "English", region = "Aberdeenshire", 
    review_count = "2", geocode_level = "rooftop", tel = "01651 842352", 
    placerank = 57L, post_town = "Ellon", category_labels = structure(c("Landmarks", 
    "Gardens"), .Dim = 1:2), existence = 1L, fax = "0844 493 2102", 
    website = "http://www.nts.org.uk/Property/Pitmedden-Garden", 
    status = "1", geocode_confidence = "100", postcode = "AB41 7PD", 
    country = "gb", category_ids = 109L, `_geocode_quality` = "4", 
    existence_ml = 0.849871115334588, email = "information@nts.org.uk", 
    address = "", `_records_touched` = "{\"crawl\":6,\"lssi\":0,\"polygon_centroid\":0,\"geocoder\":0,\"user_submission\":0,\"tdc\":0,\"gov\":0}", 
    locality = "Pitmedden", latitude = "57.343233", geo_virtual = "[\"57.343233|-2.191955|100|rooftop|4\"]"), .Names = c("existence_full", 
"category_ids_text_search", "placerank_ml", "longitude", "name", 
"domain_aggregate", "admin_region", "languages", "region", "review_count", 
"geocode_level", "tel", "placerank", "post_town", "category_labels", 
"existence", "fax", "website", "status", "geocode_confidence", 
"postcode", "country", "category_ids", "_geocode_quality", "existence_ml", 
"email", "address", "_records_touched", "locality", "latitude", 
"geo_virtual")), uuid = "bb57a153-740f-42be-aa4d-ae12d4eb57d4"), .Names = c("payload", 
"uuid")))

I want to convert this to a data-frame by populating values across different columns in the list of lists. 我想通过在列表列表中的不同列中填充值来将其转换为数据框。 Each list within the list contains information about a particular place, which is identified by its uuid. 列表中的每个列表都包含有关特定地点的信息,该地点由uuid标识。 So each row in the data-frame is going to contain information about a particular uuid. 因此,数据帧中的每一行都将包含有关特定uuid的信息。 For columns that do not have corresponding values, NA should appear. 对于没有相应值的列,应显示NA。

I tried using some of the approaches mentioned in questions similar to this concept but was unsuccessful. 我尝试使用与该概念类似的问题中提到的某些方法,但未成功。

Any thoughts would be much appreciated! 任何想法将不胜感激! Thanks 谢谢

It might have helped to have had a more expansive description of the original data layout, but here's a guess, based on what I see as the high-level structure of that object. 对原始数据布局进行更广泛的描述可能会有所帮助,但是基于我认为该对象的高级结构,这是一个猜测。 Assume that structure is named dat: 假设该结构名为dat:

> lapply(dat, names)
[[1]]
[1] "payload" "uuid"   

[[2]]
[1] "payload" "uuid"   

[[3]]
[1] "payload" "uuid"   

[[4]]
[1] "payload" "uuid" 

So extract them to lists of dataframes 所以将它们提取到数据框列表中

payloads <- lapply(dat, function(x) data.frame(x$payload))
uuids <- lapply(dat, function(x) data.frame(x$uuid))

Then bind them "side by side" 然后将它们“并排”捆绑

newdat <- mapply( cbind, payloads, uuids)

Then look at the dimensions to see if the single-line dataframes get properly copied over to the multiline dataframes. 然后查看维度,以查看单行数据框是否正确复制到多行数据框上。 The one feature that is not to your specifications is the NAs. 不符合您要求的一项功能是NA。 Since the 'uuids' are apparently identifiers, the cbind operations will have copied each of their column contents to columns of the same lengths as the 'payloads': 由于“uuid”显然是标识符,因此cbind操作会将其每个列内容复制到与“有效载荷”相同长度的列:

> lapply(payloads, dim)
[[1]]
[1]  2 32

[[2]]
[1]  2 33

[[3]]
[1]  5 35

[[4]]
[1]  1 32

> lapply(uuids, dim)
[[1]]
[1] 1 1

[[2]]
[1] 1 1

[[3]]
[1] 1 1

[[4]]
[1] 1 1

> lapply( mapply( cbind, payloads, uuids), dim)
[[1]]
[1]  2 33

[[2]]
[1]  2 34

[[3]]
[1]  5 36

[[4]]
[1]  1 33

The next level of consolidation might be to assemble all the dataframes "on top of each other", since their names are so similar: 合并的下一个级别可能是将所有数据框“彼此叠加”,因为它们的名称是如此相似:

lapply( newdat, names)
[[1]]
 [1] "existence_full"           "geo_virtual"              "latitude"                
 [4] "locality"                 "X_records_touched"        "address"                 
 [7] "email"                    "existence_ml"             "domain_aggregate"        
[10] "name"                     "search_tags"              "admin_region"            
[13] "existence"                "category_labels.1"        "category_labels.2"       
[16] "post_town"                "region"                   "review_count"            
[19] "geocode_level"            "tel"                      "placerank"               
[22] "longitude"                "placerank_ml"             "fax"                     
[25] "category_ids_text_search" "website"                  "status"                  
[28] "geocode_confidence"       "postcode"                 "category_ids"            
[31] "country"                  "X_geocode_quality"        "x.uuid"                  

[[2]]
 [1] "existence_full"           "geo_virtual"              "latitude"                
 [4] "locality"                 "X_records_touched"        "address"                 
 [7] "email"                    "longitude"                "domain_aggregate"        
[10] "name"                     "admin_region"             "search_tags"             
[13] "existence"                "category_labels.1"        "category_labels.2"       
[16] "category_labels.3"        "region"                   "review_count"            
[19] "geocode_level"            "tel"                      "placerank"               
[22] "post_town"                "placerank_ml"             "fax"                     
[25] "category_ids_text_search" "website"                  "status"                  
[28] "geocode_confidence"       "postcode"                 "category_ids"            
[31] "country"                  "X_geocode_quality"        "existence_ml"            
[34] "x.uuid"                  

[[3]]
 [1] "existence_full"           "geo_virtual"              "latitude"                
 [4] "locality"                 "hours_display"            "X_records_touched"       
 [7] "address"                  "longitude"                "domain_aggregate"        
[10] "name"                     "admin_region"             "search_tags"             
[13] "existence"                "category_labels.1"        "category_labels.2"       
[16] "region"                   "review_count"             "geocode_level"           
[19] "tel"                      "placerank"                "post_town"               
[22] "placerank_ml"             "fax"                      "category_ids_text_search"
[25] "website"                  "status"                   "hours"                   
[28] "neighborhood"             "geocode_confidence"       "postcode"                
[31] "category_ids"             "country"                  "X_geocode_quality"       
[34] "existence_ml"             "email"                    "x.uuid"                  

[[4]]
 [1] "existence_full"           "category_ids_text_search" "placerank_ml"            
 [4] "longitude"                "name"                     "domain_aggregate"        
 [7] "admin_region"             "languages"                "region"                  
[10] "review_count"             "geocode_level"            "tel"                     
[13] "placerank"                "post_town"                "category_labels.1"       
[16] "category_labels.2"        "existence"                "fax"                     
[19] "website"                  "status"                   "geocode_confidence"      
[22] "postcode"                 "country"                  "category_ids"            
[25] "X_geocode_quality"        "existence_ml"             "email"                   
[28] "address"                  "X_records_touched"        "locality"                
[31] "latitude"                 "geo_virtual"              "x.uuid"    

The rbind.fill function in Hadley's plyr package can do this efficiently: Hadley的plyr包中的rbind.fill函数可以有效地做到这一点:

install.packages("plyr")
newdat3 <-  do.call(plyr::rbind.fill, newdat)
newdat3

So looking at a few of the columns, this appears to have satisfied your requirements: 因此,看几列,这似乎已经满足您的要求:

> newdat3[ , c("locality", "category_labels.3", "neighborhood")]
         locality category_labels.3 neighborhood
1      Stonehaven              <NA>         <NA>
2      Stonehaven              <NA>         <NA>
3       Inveraray       Restaurants         <NA>
4       Inveraray       Restaurants         <NA>
5  Windsor Castle              <NA>      Chalvey
6  Windsor Castle              <NA>      Chalvey
7  Windsor Castle              <NA>      Chalvey
8  Windsor Castle              <NA>      Chalvey
9  Windsor Castle              <NA>      Chalvey
10      Pitmedden              <NA>         <NA>

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM