简体   繁体   中英

Hierarchical Data List Data Wrangling in R or Python

I have some data that was given to me in a format that I'm trying to get into a flat file for a customer. It is hierarchical data but it doesn't have all the data filled in and you can't just do an easy fill because of so many different sub levels involved. The numbers are always 4 digits and signify something specific.

This is a report that can go out to dozens of subgroups with thousands of rows of data.

Here is a sample in R:

# Sample outline: 24 rows, one entry per row. Column Lk holds the labels (or a
# 4-digit code) that appear at depth k; every other cell is NA.
L1 <- rep(NA_character_, 24)
L1[c(1, 23, 24)] <- c("Main1", "Main2", "Main3")

L2 <- rep(NA_character_, 24)
L2[2] <- "Sub2_1"

L3 <- rep(NA_character_, 24)
L3[c(3, 21)] <- c("Sub3_1", "Sub3_2")

L4 <- rep(NA_character_, 24)
L4[c(4, 14, 22)] <- c("Sub4_1", "Sub4_2", "0015")

L5 <- rep(NA_character_, 24)
L5[c(5, 8, 10, 15)] <- c("Sub5_1", "Sub5_2", "Sub5_3", "Sub5_5")

L6 <- rep(NA_character_, 24)
L6[c(6, 7, 9, 11, 16)] <- c("1111", "2885", "0001", "Sub6_1", "Sub6_2")

L7 <- rep(NA_character_, 24)
L7[c(12, 17)] <- c("Sub7_1", "Sub7_2")

L8 <- rep(NA_character_, 24)
L8[c(13, 18, 19)] <- c("0011", "9494", "Sub8_1")

L9 <- rep(NA_character_, 24)
L9[20] <- "8479"

# Assemble the nine level columns into the input data frame.
df <- data.frame(L1, L2, L3, L4, L5, L6, L7, L8, L9)

And I want an output like this since the four digit "code" is what we really need to look up:

# Desired flat result: one row per 4-digit code (plus one row for each Main
# entry that has no code), with the labels of every ancestor level alongside.
code_f <- c("1111", "2885", "0001", "0011", "9494", "8479", "0015", NA, NA)
L1_f <- c(rep("Main1", 7), "Main2", "Main3")
L2_f <- c(rep("Sub2_1", 7), NA, NA)
L3_f <- c(rep("Sub3_1", 6), "Sub3_2", NA, NA)
L4_f <- c(rep("Sub4_1", 4), rep("Sub4_2", 2), rep(NA, 3))
L5_f <- c(rep("Sub5_1", 2), "Sub5_2", "Sub5_3", rep("Sub5_5", 2), rep(NA, 3))
# Codes 9494 and 8479 sit under "Sub6_2" in the sample data; the data contains
# no "Sub6_3" label.
L6_f <- c(rep(NA, 3), "Sub6_1", rep("Sub6_2", 2), rep(NA, 3))
L7_f <- c(rep(NA, 3), "Sub7_1", rep("Sub7_2", 2), rep(NA, 3))
L8_f <- c(rep(NA, 5), "Sub8_1", rep(NA, 3))

df_f <- data.frame(code_f, L1_f, L2_f, L3_f, L4_f, L5_f, L6_f, L7_f, L8_f)

I do not see 0015 in your data, so cannot tell where it came from. But from what you provided, we can do:

# Forward-fill the columns of `data` one at a time, from column `i` rightwards.
#
# Each call fills column `i` with tidyr::fill() (which works within the groups
# already set on `data`), regroups the result by columns 1..i, and recurses on
# the next column. The final call returns its own input, so the fill computed
# for the last column is discarded and that column comes back as it was passed
# in.
#
# data: a data frame (possibly already grouped by its leading columns).
# i:    1-based index of the column to fill on this call.
mm <- function(data, i = 1) {
  cols <- names(data)
  filled <- tidyr::fill(data, !!!cols[i])
  regrouped <- group_by(filled, .dots = cols[1:i])
  if (i >= ncol(regrouped)) {
    return(data)
  }
  mm(regrouped, i + 1)
}

 # Flatten the outline in `df` into one row per code, via a text round-trip.
 # Relies on mm() and on %>%, group_by(), mutate(), filter(), select() and
 # lift() being attached (lift is presumably purrr::lift -- confirm).
 df%>%
  # Fill each level downwards within its parent levels.
  mm%>%
  # b: one space-separated string per row (the columns are passed to paste()).
  # Keep the rows that contain a standalone run of digits, plus the rows in
  # which every column except the first is NA.
  {b =lift(paste)(.); b[grepl("\\b\\d+\\b",b)| rowSums(is.na({.}[-1]))==ncol(df)-1]}%>%
  # Move the first standalone digit run (or the first "NA") to the front of
  # the line, followed by the text that preceded it.
  sub("(.*?)(\\b\\d+\\b|NA)","\\2 \\1",.)%>%
  # Parse the lines back into whitespace-separated character columns V1, V2, ...;
  # fill=T pads rows that have fewer fields.
  read.table(text=.,fill=T,colClasses = "character")%>%
  # r flags rows in which all columns but one are NA.
  mutate(r=rowSums(is.na(.))==ncol(.)-1)%>%
  group_by(V2)%>%
  # `&` binds tighter than `|`: keep every non-flagged row, and keep a flagged
  # row only when it is the sole row of its V2 group.
  filter(n()==1 & r|!r)%>%
  # Drop the helper flag and return a plain data frame.
  select(-r)%>%data.frame()
    V1    V2     V3     V4     V5     V6     V7     V8     V9
1 1111 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1   <NA>   <NA>   <NA>
2 2885 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1   <NA>   <NA>   <NA>
3 0001 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_2   <NA>   <NA>   <NA>
4 0011 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_3 Sub6_1 Sub7_1   <NA>
5 9494 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2   <NA>
6 8479 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 Sub8_1
7   15 Main1 Sub2_1 Sub3_2   <NA>   <NA>   <NA>   <NA>   <NA>
8 <NA> Main2   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>
9 <NA> Main3   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>

The Python version would be:

import pandas as pd
import numpy as np
import re

#df = pd.read_clipboard()
 #df[df=="<NA>"]=np.nan
#df['L1']=df['L1'].ffill()
def mmpy(data,m,i=0):
    """Forward-fill the columns of ``data`` one by one, starting at position ``i``.

    Column ``i`` is replaced by ``m[column].ffill()``; the frame is then grouped
    by columns ``0..i`` and that grouping becomes the fill source for the next
    column. Every step works on a deep copy, so the frame passed in is left
    untouched.

    Args:
        data: DataFrame whose columns are filled from left to right.
        m: Object indexed by column name to obtain the values to forward-fill
            (the callers in this file pass a copy of ``data``).
        i: Zero-based position of the first column to fill.

    Returns:
        A new DataFrame with columns ``i`` onwards filled.
    """
    frame = data
    source = m
    position = i
    while True:
        frame = frame.copy(deep=True)
        column = frame.columns[position]
        frame.iloc[:, position] = source[column].ffill()
        # Later columns are filled within the groups formed by all columns so far.
        source = frame.groupby(list(frame.columns[0:position + 1]))
        if position >= len(frame.columns) - 1:
            return frame
        position += 1


# Forward-fill every level within its parent levels.
s = mmpy(df,df.copy())
# Render each row as one line of space-separated cell values.
a = "\n".join(" ".join(str(k) for k in row.values()) for row in s.T.to_dict().values())
# Move the first standalone digit run (or the first "nan") to the front of its line.
b = re.sub(r"^(.*?)(\b\d+\b|nan)",r"\2 \1",a,flags=re.M)

# Keep the lines that start with a code, plus the "Main" lines that have no
# digit after the Main label. The trailing class excludes "\n" as well as
# digits so that one match can never run on into the following line.
w = pd.DataFrame([i.split() for i in re.findall(r"^\d+.*$|.*Main\S* [^\d\n]*$",b,re.M)])

      0      1       2       3       4       5       6       7       8
0   nan  Main1     nan     nan     nan     nan     nan     nan     nan
1  1111  Main1  Sub2_1  Sub3_1  Sub4_1  Sub5_1     nan     nan     nan
2  2885  Main1  Sub2_1  Sub3_1  Sub4_1  Sub5_1     nan     nan     nan
3  0001  Main1  Sub2_1  Sub3_1  Sub4_1  Sub5_2     nan     nan     nan
4  0011  Main1  Sub2_1  Sub3_1  Sub4_1  Sub5_3  Sub6_1  Sub7_1     nan
5  9494  Main1  Sub2_1  Sub3_1  Sub4_2  Sub5_5  Sub6_2  Sub7_2     nan
6  8479  Main1  Sub2_1  Sub3_1  Sub4_2  Sub5_5  Sub6_2  Sub7_2  Sub8_1
7    15  Main1  Sub2_1  Sub3_2     nan     nan     nan     nan     nan
8   nan  Main2     nan     nan     nan     nan     nan     nan     nan
9   nan  Main3     nan     nan     nan     nan     nan     nan     nan

I am not sure whether I understood your question 100% correctly, but this seems to replicate your desired output (assuming that there is some additional data that is not present in df, as noted in Onyambu's comment).

#change format of data: flatten df row by row into one character vector, so
#that walking the vector front to back reads the outline top to bottom
vec <- c(t(as.matrix(df)))
#sub-levels present in the data (Sub2_* ... Sub8_*)
subLevels <- 2:8
#regex patterns for 4-digit number and levels
patterns <- c("[0-9]{4,4}", "Main[0-9]{1,}",
              paste0("Sub", subLevels, "_[0-9]{1,}"))
#find indices for each level
idxList <- lapply(patterns, grep, vec)
#replace all data that does not correspond to a given level by NA
#(start from an all-NA vector: `tmp[-x] <- NA` would blank nothing for a
#pattern without matches, because indexing with -integer(0) selects nothing)
valList <- lapply(idxList, function(x) {
  tmp <- rep(NA_character_, length(vec))
  tmp[x] <- vec[x]
  tmp
})

#the zoo library has a function to move missing values forward -> na.locf
library(zoo)
#positions of the 4-digit codes in vec, and the column of df each one sits in
codeIdx <- idxList[[1]]
codeCol <- (codeIdx - 1) %% ncol(df) + 1
#for each 4-digit number and each level, find the respective level-string
levelMat <- do.call("cbind",
                    lapply(valList[-1],
                           function(x) na.locf(x, na.rm = FALSE)[codeIdx]))
#na.locf carries a label forward even after a shallower level has changed.
#Level k is stored in column k of df, so a code found in column c can only
#have ancestors in columns 1..(c-1): blank everything at or below its own depth.
levelMat[col(levelMat) >= codeCol] <- NA
data.frame(code_f = vec[codeIdx], levelMat)

#   code_f    X1     X2     X3     X4     X5     X6     X7     X8
# 1   1111 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1   <NA>   <NA>   <NA>
# 2   2885 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1   <NA>   <NA>   <NA>
# 3   0001 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_2   <NA>   <NA>   <NA>
# 4   0011 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_3 Sub6_1 Sub7_1   <NA>
# 5   9494 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2   <NA>
# 6   8479 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 Sub8_1
# 7   0015 Main1 Sub2_1 Sub3_2   <NA>   <NA>   <NA>   <NA>   <NA>

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM