I have data formatted as such:
X Raw data
1 %100,02231, ,001,013, -00007,000,999 &IC ,001,013 >vs ,0652 ?2 ,2 ,00007 .vss ,0655 ?2 ,2 ,00007 .mdb ,0700 ?2 ,2 ,00007 .arn ,0704 ?1 ,1
2 %100,02231, ,001,023, -00008,000,999 &IC ,001,023 >vs ,0652 ?3 ,3 ,00008 .vss ,0655 ?2 ,2 ,00008 .mdb ,0700 ?2 ,2 ,00008 .arn ,0704 ?1 ,1 ,00008 +gs ,0713,0714 ?2 ,2 ,00008 .bzl ,0719 ?2 ,2 ,00008 .krg ,0724 ?1 ,1
Etc.
I want to transform this raw data into a nice table shape. I know how to separate certain things using the separate function of tidyr
like so:
DFx <- separate(DF, Raw.data, into="Starting station", sep=">", extra="warn", fill = "right")
>
for example indicates the starting station. &
will indicate the train type. In the above example it will separate the starting station from the rest. I am looking for the best way to put this big dataset (only sampled a part of the lines) into a nice table. I am not afraid of a bit of manual labour, but just looking for any pointers that can put me in the right direction. Thanks.
The timetbls.dat file that I use can be downloaded from: here
Document about data format (unfortunately in Dutch, but maybe it can help some because you can still see the outline of the data), see pages 9/28-11/28: here
Bedankt voor de documentatie!
Let's get this large bit of code out of the way first (scroll past it for some commentary and notes on the list format):
# Reference: Section 5 of IFF Standaard
parse_iff_timetable <- function(path) {
suppressPackageStartupMessages({
require("stringi", quietly = TRUE, warn.conflicts = FALSE)
require("tidyverse", quietly = TRUE, warn.conflicts = FALSE)
})
lines <- stri_read_lines(path.expand(path)) # read in all the lines
starts <- which(grepl("^#", lines)) # find all the records
ends <- c(starts[-1], length(lines))
pb <- progress_estimated(length(starts)) # this took 3m on my system so progress bars might be handy
map2(starts, ends, ~{
pb$tick()$print()
rec_num <- ""
rec <- list(service = list(), stop = list())
index <- 0
for (l in lines[.x:.y]) { # iterate over the record
if (stri_sub(l, 1, 1) == "#") { # (ritnummer)
stri_sub(l, 1, 1) <- ""
rec_num <- l
} else if (stri_sub(l, 1, 1) == "%") { # (vervoerder)
stri_sub(l, 1, 1) <- ""
tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
set_names(
tmp, c("company_number", "service_number", "variant", "first_stop",
"last_stop", "service_name")
) -> tmp
rec$service <- append(rec$service, list(as.list(tmp)))
} else if (stri_sub(l, 1, 1) == "-") { # (voetnoot)
stri_sub(l, 1, 1) <- ""
tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
tmp <- set_names(tmp, c("footnote", "first_stop", "last_stop"))
tmp <- as.list(tmp)
rec$validity <- tmp
} else if (stri_sub(l, 1, 1) == "&") { # (vervoerssort)
stri_sub(l, 1, 1) <- ""
tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
tmp <- set_names(tmp, c("mode", "first_stop", "last_stop"))
tmp <- as.list(tmp)
rec$transport <- tmp
} else if (stri_sub(l, 1, 1) == "*") { # (attribuut)
stri_sub(l, 1, 1) <- ""
tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
tmp <- set_names(tmp, c("code", "first_stop", "last_stop", "unknown"))
tmp <- as.list(tmp)
rec$attribute <- tmp
} else if (stri_sub(l, 1, 1) == ">") { # (begin van de rit)
index <- index + 1
stri_sub(l, 1, 1) <- ""
tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
tmp <- set_names(tmp, c("station_short", "departure_time"))
tmp <- as.list(tmp)
tmp$index <- index
tmp$arrival_time <- NA_character_
rec$stop <- list(tmp)
} else if (stri_sub(l, 1, 1) == ".") { # (korte stop)
index <- index + 1
stri_sub(l, 1, 1) <- ""
tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
tmp <- set_names(tmp, c("station_short", "departure_time"))
tmp <- as.list(tmp)
tmp$index <- index
tmp$arrival_time <- tmp$departure_time
rec$stop <- append(rec$stop, list(tmp))
} else if (stri_sub(l, 1, 1) == ";") { # (passeer station)
index <- index + 1
stri_sub(l, 1, 1) <- ""
tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
tmp <- set_names(tmp, c("station_short"))
tmp <- as.list(tmp)
tmp$index <- index
tmp$arrival_time <- NA_character_
tmp$departure_time <- NA_character_
rec$stop <- append(rec$stop, list(tmp))
} else if (stri_sub(l, 1, 1) == "+") { # (a/v stop)
index <- index + 1
stri_sub(l, 1, 1) <- ""
tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
tmp <- set_names(tmp, c("station_short", "arrival_time", "departure_time"))
tmp <- as.list(tmp)
tmp$index <- index
rec$stop <- append(rec$stop, list(tmp))
} else if (stri_sub(l, 1, 1) == "?") { # (spoor)
stri_sub(l, 1, 1) <- ""
tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
tmp <- set_names(tmp, c("arrival_platform", "departure_platform", "footnote"))
tmp <- as.list(tmp)
tmp$index <- index
if (stri_sub(tmp$arrival_platform, 1,1) != stri_sub(tmp$departure_platform, 1,1)) {
message(
sprintf(
"\nNOTE: Difference in arrival/departure platforms: %s/%s (Record: #%s)",
tmp$arrival_platform, tmp$departure_platform, rec_num
)
)
}
rec$platform <- tmp
} else if (stri_sub(l, 1, 1) == "<") { # (eind van de rit)
index <- index + 1
stri_sub(l, 1, 1) <- ""
tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
tmp <- set_names(tmp, c("station_short", "arrival_time"))
tmp <- as.list(tmp)
tmp$index <- index
tmp$departure_time <- NA_character_
rec$stop <- append(rec$stop, list(tmp))
}
}
rec
})
}
I use stringi
in ^^ as there is a high likelihood of this standard being used in many locales and stringi
should take care of encoding normalization for us.
If I run that on the 13MB dat file:
ns_tbl <- parse_iff_timetable("~/data/ns-latest/timetbls.dat")
it takes ~3m (fast character-by-character ops is not one of R's strong points) and there's one cautionary note in there about one record having different arrival/departure platforms. An Rcpp-version of this would likely be much faster. Since order doesn't really matter, the furrr
or pbapply
packages may also be able to reduce time to <1m with just a minor change to the code.
The basic idiom is to go line-by line for each "record" and build up a large nested list structure (this is not "flat" data by any stretch of the imagination).
Let's walk through one record (the first one):
str(ns_tbl[1], 2)
## List of 1
## $ :List of 5
## ..$ service :List of 2
## ..$ stop :List of 34
## ..$ validity :List of 3
## ..$ transport:List of 3
## ..$ platform :List of 4
the stop
element is kinda big, so let's look at the other ones first:
str(ns_tbl[[1]][-2], 3)
## List of 4
## $ service :List of 2
## ..$ :List of 6
## .. ..$ company_number: chr "100"
## .. ..$ service_number: chr "11410"
## .. ..$ variant : chr ""
## .. ..$ first_stop : chr "001"
## .. ..$ last_stop : chr "002"
## .. ..$ service_name : chr "Nachtnettrein"
## ..$ :List of 6
## .. ..$ company_number: chr "100"
## .. ..$ service_number: chr "01412"
## .. ..$ variant : chr ""
## .. ..$ first_stop : chr "002"
## .. ..$ last_stop : chr "008"
## .. ..$ service_name : chr "Nachtnettrein"
## $ validity :List of 3
## ..$ footnote : chr "00002"
## ..$ first_stop: chr "000"
## ..$ last_stop : chr "999"
## $ transport:List of 3
## ..$ mode : chr "IC"
## ..$ first_stop: chr "001"
## ..$ last_stop : chr "008"
## $ platform :List of 4
## ..$ arrival_platform : chr "5"
## ..$ departure_platform: chr "5"
## ..$ footnote : chr "00002"
## ..$ index : num 34
And, we can look at the first stop, second stop (no arrival/dest so i guess those aren't stops), one stop with arrival/departure, and the last stop:
str(ns_tbl[[1]]$stop[c(1, 2, 6, 34)], 2)
## List of 4
## $ :List of 4
## ..$ station_short : chr "rtd"
## ..$ departure_time: chr "2532"
## ..$ index : num 1
## ..$ arrival_time : chr NA
## $ :List of 4
## ..$ station_short : chr "rtn"
## ..$ index : num 2
## ..$ arrival_time : chr NA
## ..$ departure_time: chr NA
## $ :List of 4
## ..$ station_short : chr "gd"
## ..$ arrival_time : chr "2550"
## ..$ departure_time: chr "2557"
## ..$ index : num 6
## $ :List of 4
## ..$ station_short : chr "ut"
## ..$ arrival_time : chr "2751"
## ..$ index : num 34
## ..$ departure_time: chr NA
I'll gladly amend this with more info based on comments.
You can use standard R idioms for turning parts or all of this into a data frame:
map_df(ns_tbl, ~{
as.list(c(
unlist(.x$validity),
unlist(.x$transport),
unlist(.x$platform)
)) -> out
out$service <- list(.x$service)
out$stop <- list(.x$stop)
out
}) %>%
glimpse()
## Observations: 40,901
## Variables: 9
## $ footnote <chr> "00002", "00003", "00004", "00005", ...
## $ first_stop <chr> "001", "001", "001", "001", "001", "...
## $ last_stop <chr> "008", "008", "007", "007", "007", "...
## $ mode <chr> "IC", "IC", "IC", "IC", "IC", "IC", ...
## $ arrival_platform <chr> "5", "5", "5", "5", "5", "5", "5", "...
## $ departure_platform <chr> "5", "5", "5", "5", "5", "5", "5", "...
## $ index <chr> "34", "34", "34", "34", "34", "34", ...
## $ service <list> [[["100", "11410", "", "001", "002"...
## $ stop <list> [[["rtd", "2532", 1, NA], ["rtn", 2...
You still need to deal with unnesting the bits with multiple records.
Also, index
at the top level is really just metadata for the # of stops, but I'll leave better naming up to you.
Ideally, one would parse the smaller metadata files and use the expanded versions of various abbreviated names.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.