简体   繁体   中英

Separating a long string with multiple delimiters into multiple columns

I have data formatted as such:

X    Raw data
1     %100,02231,      ,001,013, -00007,000,999 &IC  ,001,013 >vs     ,0652 ?2    ,2    ,00007 .vss    ,0655 ?2    ,2    ,00007 .mdb    ,0700 ?2    ,2    ,00007 .arn    ,0704 ?1    ,1
2     %100,02231,      ,001,023, -00008,000,999 &IC  ,001,023 >vs     ,0652 ?3    ,3    ,00008 .vss    ,0655 ?2    ,2    ,00008 .mdb    ,0700 ?2    ,2    ,00008 .arn    ,0704 ?1    ,1    ,00008 +gs     ,0713,0714 ?2    ,2    ,00008 .bzl    ,0719 ?2    ,2    ,00008 .krg    ,0724 ?1    ,1

Etc.

I want to transform this raw data into a nice table shape. I know how to separate certain things using the separate function of tidyr like so:

DFx <- separate(DF, Raw.data, into="Starting station", sep=">", extra="warn", fill = "right")

> for example indicates the starting station. & will indicate the train type. In the above example it will separate the starting station from the rest. I am looking for the best way to put this big dataset (only sampled a part of the lines) into a nice table. I am not afraid of a bit of manual labour, but just looking for any pointers that can put me in the right direction. Thanks.

The timetbls.dat file that I use can be downloaded from: here

Document about data format (unfortunately in Dutch, but maybe it can help some because you can still see the outline of the data), see pages 9/28-11/28: here

Bedankt voor de documentatie!

Let's get this large bit of code out of the way first (scroll past it for some commentary and notes on the list format):

# Reference: Section 5 of IFF Standaard
parse_iff_timetable <- function(path) {

  suppressPackageStartupMessages({
    require("stringi", quietly = TRUE, warn.conflicts = FALSE)
    require("tidyverse", quietly = TRUE, warn.conflicts = FALSE)
  })

  lines <- stri_read_lines(path.expand(path)) # read in all the lines

  starts <- which(grepl("^#", lines)) # find all the records
  ends <- c(starts[-1], length(lines))

  pb <- progress_estimated(length(starts)) # this took 3m on my system so progress bars might be handy

  map2(starts, ends, ~{

    pb$tick()$print()

    rec_num <- ""
    rec <- list(service = list(), stop = list())
    index <- 0

    for (l in lines[.x:.y]) { # iterate over the record

      if (stri_sub(l, 1, 1) == "#") { # (ritnummer)

        stri_sub(l, 1, 1) <- ""
        rec_num <-  l

      } else if (stri_sub(l, 1, 1) == "%") { # (vervoerder)

        stri_sub(l, 1, 1) <- ""
        tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
        set_names(
          tmp, c("company_number", "service_number", "variant", "first_stop",
                 "last_stop", "service_name")
        ) -> tmp

        rec$service <- append(rec$service, list(as.list(tmp)))

      } else if (stri_sub(l, 1, 1) == "-") { # (voetnoot)

        stri_sub(l, 1, 1) <- ""
        tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
        tmp <- set_names(tmp, c("footnote", "first_stop", "last_stop"))
        tmp <- as.list(tmp)

        rec$validity <- tmp

      } else if (stri_sub(l, 1, 1) == "&") { # (vervoerssort)

        stri_sub(l, 1, 1) <- ""
        tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
        tmp <- set_names(tmp, c("mode", "first_stop", "last_stop"))
        tmp <- as.list(tmp)

        rec$transport <- tmp

      } else if (stri_sub(l, 1, 1) == "*") { # (attribuut)

        stri_sub(l, 1, 1) <- ""
        tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
        tmp <- set_names(tmp, c("code", "first_stop", "last_stop", "unknown"))
        tmp <- as.list(tmp)

        rec$attribute <- tmp

      } else if (stri_sub(l, 1, 1) == ">") { # (begin van de rit)

        index <- index + 1

        stri_sub(l, 1, 1) <- ""
        tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
        tmp <- set_names(tmp, c("station_short", "departure_time"))
        tmp <- as.list(tmp)
        tmp$index <- index
        tmp$arrival_time <- NA_character_

        rec$stop <- list(tmp)

      } else if (stri_sub(l, 1, 1) == ".") { # (korte stop)

        index <- index + 1

        stri_sub(l, 1, 1) <- ""
        tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
        tmp <- set_names(tmp, c("station_short", "departure_time"))
        tmp <- as.list(tmp)
        tmp$index <- index
        tmp$arrival_time <- tmp$departure_time

        rec$stop <- append(rec$stop, list(tmp))

      } else if (stri_sub(l, 1, 1) == ";") { # (passeer station)

        index <- index + 1

        stri_sub(l, 1, 1) <- ""
        tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
        tmp <- set_names(tmp, c("station_short"))
        tmp <- as.list(tmp)
        tmp$index <- index
        tmp$arrival_time <- NA_character_
        tmp$departure_time <- NA_character_

        rec$stop <- append(rec$stop, list(tmp))

      } else if (stri_sub(l, 1, 1) == "+") { # (a/v stop)

        index <- index + 1

        stri_sub(l, 1, 1) <- ""
        tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
        tmp <- set_names(tmp, c("station_short", "arrival_time", "departure_time"))
        tmp <- as.list(tmp)
        tmp$index <- index

        rec$stop <- append(rec$stop, list(tmp))

      } else if (stri_sub(l, 1, 1) == "?") { # (spoor)

        stri_sub(l, 1, 1) <- ""
        tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
        tmp <- set_names(tmp, c("arrival_platform", "departure_platform", "footnote"))
        tmp <- as.list(tmp)
        tmp$index <- index

        if (stri_sub(tmp$arrival_platform, 1,1) != stri_sub(tmp$departure_platform, 1,1)) {
          message(
            sprintf(
              "\nNOTE: Difference in arrival/departure platforms: %s/%s (Record: #%s)",
              tmp$arrival_platform, tmp$departure_platform, rec_num
            )
          )
        }

        rec$platform <- tmp

      } else if (stri_sub(l, 1, 1) == "<") { # (eind van de rit)

        index <- index + 1

        stri_sub(l, 1, 1) <- ""
        tmp <- stri_trim_both(stri_split_fixed(l, ",")[[1]])
        tmp <- set_names(tmp, c("station_short", "arrival_time"))
        tmp <- as.list(tmp)
        tmp$index <- index
        tmp$departure_time <- NA_character_

        rec$stop <- append(rec$stop, list(tmp))

      }

    }

    rec

  })

}

I use stringi in ^^ as there is a high likelihood of this standard being used in many locales and stringi should take care of encoding normalization for us.

If I run that on the 13MB dat file:

ns_tbl <- parse_iff_timetable("~/data/ns-latest/timetbls.dat")

it takes ~3m (fast character-by-character ops is not one of R's strong points) and there's one cautionary note in there about one record having different arrival/departure platforms. An Rcpp-version of this would likely be much faster. Since order doesn't really matter, the furrr or pbapply packages may also be able to reduce time to <1m with just a minor change to the code.

The basic idiom is to go line-by line for each "record" and build up a large nested list structure (this is not "flat" data by any stretch of the imagination).

Let's walk through one record (the first one):

str(ns_tbl[1], 2)
## List of 1
##  $ :List of 5
##   ..$ service  :List of 2
##   ..$ stop     :List of 34
##   ..$ validity :List of 3
##   ..$ transport:List of 3
##   ..$ platform :List of 4

the stop element is kinda big, so let's look at the other ones first:

str(ns_tbl[[1]][-2], 3)
## List of 4
##  $ service  :List of 2
##   ..$ :List of 6
##   .. ..$ company_number: chr "100"
##   .. ..$ service_number: chr "11410"
##   .. ..$ variant       : chr ""
##   .. ..$ first_stop    : chr "001"
##   .. ..$ last_stop     : chr "002"
##   .. ..$ service_name  : chr "Nachtnettrein"
##   ..$ :List of 6
##   .. ..$ company_number: chr "100"
##   .. ..$ service_number: chr "01412"
##   .. ..$ variant       : chr ""
##   .. ..$ first_stop    : chr "002"
##   .. ..$ last_stop     : chr "008"
##   .. ..$ service_name  : chr "Nachtnettrein"
##  $ validity :List of 3
##   ..$ footnote  : chr "00002"
##   ..$ first_stop: chr "000"
##   ..$ last_stop : chr "999"
##  $ transport:List of 3
##   ..$ mode      : chr "IC"
##   ..$ first_stop: chr "001"
##   ..$ last_stop : chr "008"
##  $ platform :List of 4
##   ..$ arrival_platform  : chr "5"
##   ..$ departure_platform: chr "5"
##   ..$ footnote          : chr "00002"
##   ..$ index             : num 34

And, we can look at the first stop, second stop (no arrival/dest so i guess those aren't stops), one stop with arrival/departure, and the last stop:

str(ns_tbl[[1]]$stop[c(1, 2, 6, 34)], 2)
## List of 4
##  $ :List of 4
##   ..$ station_short : chr "rtd"
##   ..$ departure_time: chr "2532"
##   ..$ index         : num 1
##   ..$ arrival_time  : chr NA
##  $ :List of 4
##   ..$ station_short : chr "rtn"
##   ..$ index         : num 2
##   ..$ arrival_time  : chr NA
##   ..$ departure_time: chr NA
##  $ :List of 4
##   ..$ station_short : chr "gd"
##   ..$ arrival_time  : chr "2550"
##   ..$ departure_time: chr "2557"
##   ..$ index         : num 6
##  $ :List of 4
##   ..$ station_short : chr "ut"
##   ..$ arrival_time  : chr "2751"
##   ..$ index         : num 34
##   ..$ departure_time: chr NA

I'll gladly amend this with more info based on comments.

You can use standard R idioms for turning parts or all of this into a data frame:

map_df(ns_tbl, ~{
  as.list(c(
    unlist(.x$validity),
    unlist(.x$transport),
    unlist(.x$platform)
  )) -> out
  out$service <- list(.x$service)
  out$stop <- list(.x$stop)
  out
}) %>% 
  glimpse()
## Observations: 40,901
## Variables: 9
## $ footnote           <chr> "00002", "00003", "00004", "00005", ...
## $ first_stop         <chr> "001", "001", "001", "001", "001", "...
## $ last_stop          <chr> "008", "008", "007", "007", "007", "...
## $ mode               <chr> "IC", "IC", "IC", "IC", "IC", "IC", ...
## $ arrival_platform   <chr> "5", "5", "5", "5", "5", "5", "5", "...
## $ departure_platform <chr> "5", "5", "5", "5", "5", "5", "5", "...
## $ index              <chr> "34", "34", "34", "34", "34", "34", ...
## $ service            <list> [[["100", "11410", "", "001", "002"...
## $ stop               <list> [[["rtd", "2532", 1, NA], ["rtn", 2...

You still need to deal with unnesting the bits with multiple records.

Also, index at the top level is really just metadata for the # of stops, but I'll leave better naming up to you.

Ideally, one would parse the smaller metadata files and use the expanded versions of various abbreviated names.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM