I was given some data in a format that I'm trying to turn into a flat file for a customer. The data is hierarchical, but not every cell is filled in, and a simple fill-down won't work because of the many nested sub-levels involved. The codes are always 4 digits, and each one signifies something specific.
This is a report that can go out to dozens of subgroups with thousands of rows of data.
Here is a sample in R:
# Sample of the sparse hierarchical sheet: 24 rows, one column per level.
# Every column starts out entirely NA and only the rows that actually carry a
# label or a 4-digit code are filled in, by position.
L1 <- replace(rep(NA_character_, 24), c(1, 23, 24), c("Main1", "Main2", "Main3"))
L2 <- replace(rep(NA_character_, 24), 2, "Sub2_1")
L3 <- replace(rep(NA_character_, 24), c(3, 21), c("Sub3_1", "Sub3_2"))
L4 <- replace(rep(NA_character_, 24), c(4, 14, 22), c("Sub4_1", "Sub4_2", "0015"))
L5 <- replace(rep(NA_character_, 24), c(5, 8, 10, 15), c("Sub5_1", "Sub5_2", "Sub5_3", "Sub5_5"))
L6 <- replace(rep(NA_character_, 24), c(6, 7, 9, 11, 16), c("1111", "2885", "0001", "Sub6_1", "Sub6_2"))
L7 <- replace(rep(NA_character_, 24), c(12, 17), c("Sub7_1", "Sub7_2"))
L8 <- replace(rep(NA_character_, 24), c(13, 18, 19), c("0011", "9494", "Sub8_1"))
L9 <- replace(rep(NA_character_, 24), 20, "8479")
# One data frame column per hierarchy level.
df <- data.frame(L1, L2, L3, L4, L5, L6, L7, L8, L9)
And I want an output like this since the four digit "code" is what we really need to look up:
# Desired flat output: one row per 4-digit code (plus one row for each top
# level entry that has no code), with the labels of all its ancestors.
code_f <- c("1111", "2885", "0001", "0011", "9494", "8479", "0015", NA, NA)
L1_f <- c(rep("Main1", 7), "Main2", "Main3")
L2_f <- c(rep("Sub2_1", 7), NA, NA)
L3_f <- c(rep("Sub3_1", 6), "Sub3_2", NA, NA)
L4_f <- c(rep("Sub4_1", 4), rep("Sub4_2", 2), rep(NA, 3))
L5_f <- c(rep("Sub5_1", 2), "Sub5_2", "Sub5_3", rep("Sub5_5", 2), rep(NA, 3))
# Codes 9494 and 8479 sit under "Sub6_2" in the input; the input contains no
# "Sub6_3" label.
L6_f <- c(rep(NA, 3), "Sub6_1", rep("Sub6_2", 2), rep(NA, 3))
L7_f <- c(rep(NA, 3), "Sub7_1", rep("Sub7_2", 2), rep(NA, 3))
L8_f <- c(rep(NA, 5), "Sub8_1", rep(NA, 3))
df_f <- data.frame(code_f, L1_f, L2_f, L3_f, L4_f, L5_f, L6_f, L7_f, L8_f)
I do not see `0015` in your data, so I cannot tell where it came from. But from what you provided, we can do:
# Recursively fill the columns of `data` downward, working left to right.
#
# data: a data frame (possibly already grouped by an earlier call).
# i:    position of the column to fill in this call; starts at 1.
#
# Column i is filled with tidyr::fill(), which works within the groups the
# data frame already carries. The result is then grouped by columns 1..i, so
# the next column is only filled among rows sharing the same ancestors.
mm <- function(data, i = 1) {
  col_names <- names(data)
  filled <- group_by(
    tidyr::fill(data, !!!col_names[i]),
    .dots = col_names[1:i]
  )
  if (i >= ncol(filled)) {
    # NOTE(review): the final call hands back `data`, not `filled`, so the
    # last column is left unfilled -- presumably intentional; confirm.
    return(data)
  }
  mm(filled, i + 1)
}
# Flatten `df` to one row per code: fill the hierarchy downward, then use
# text manipulation to move each row's code into the first column.
df%>%
# Fill every level column downward within its parent group (see mm).
mm%>%
# Build one space-separated string per row (assumes `lift` is purrr::lift, so
# paste() receives the columns as separate arguments -- TODO confirm). Keep a
# string if it contains a standalone run of digits, or if its row is NA in
# every column except the first.
{b =lift(paste)(.); b[grepl("\\b\\d+\\b",b)| rowSums(is.na({.}[-1]))==ncol(df)-1]}%>%
# Move the first standalone digit run (or, failing that, the first "NA") to
# the front of each string.
sub("(.*?)(\\b\\d+\\b|NA)","\\2 \\1",.)%>%
# Re-parse the strings as a whitespace-separated table of character columns;
# fill pads rows that have fewer fields.
read.table(text=.,fill=T,colClasses = "character")%>%
# r flags rows in which every column but one is NA.
mutate(r=rowSums(is.na(.))==ncol(.)-1)%>%
group_by(V2)%>%
# Keep every unflagged row; keep a flagged row only when it is the sole row
# for its V2 value.
filter(n()==1 & r|!r)%>%
# Drop the helper flag and return a plain data frame.
select(-r)%>%data.frame()
V1 V2 V3 V4 V5 V6 V7 V8 V9
1 1111 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1 <NA> <NA> <NA>
2 2885 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1 <NA> <NA> <NA>
3 0001 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_2 <NA> <NA> <NA>
4 0011 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_3 Sub6_1 Sub7_1 <NA>
5 9494 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 <NA>
6 8479 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 Sub8_1
7 15 Main1 Sub2_1 Sub3_2 <NA> <NA> <NA> <NA> <NA>
8 <NA> Main2 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
9 <NA> Main3 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
import pandas as pd
import numpy as np
import re
#df = pd.read_clipboard()
#df[df=="<NA>"]=np.nan
#df['L1']=df['L1'].ffill()
def mmpy(data, m, i=0):
    """Forward-fill the columns of ``data`` left to right, one level at a time.

    Column ``i`` is replaced by the forward-filled column taken from ``m``.
    The frame is then grouped by columns ``0..i`` and that grouping is passed
    to the next call, so each later column is filled only within rows that
    share the same values in all earlier columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are ordered from the outermost level inwards.
        It is deep-copied on every call, so the caller's frame is not modified.
    m : pandas.DataFrame or DataFrameGroupBy
        Object indexed by column name whose selection supports ``.ffill()``:
        a plain copy of the frame on the first call, the grouping built by the
        previous level afterwards.
    i : int, default 0
        Position of the column to fill in this call.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``i`` onwards filled.
    """
    data = data.copy(deep=True)
    data.iloc[:, i] = m[data.columns[i]].ffill()
    # Group by every column filled so far; the next level fills inside these groups.
    m = data.groupby(list(data.columns[0:i + 1]))
    if i < len(data.columns) - 1:
        return mmpy(data, m, i + 1)
    return data
# Fill every level column downward within its parent group.
s = mmpy(df, df.copy())
# Render each row as one line of space-separated cell values.
a = "\n".join(" ".join(str(k) for k in row.values()) for row in s.T.to_dict().values())
# On every line, move the first standalone digit run (or the first "nan") to the front.
b = re.sub(r"^(.*?)(\b\d+\b|nan)", r"\2 \1", a, flags=re.M)
# Keep lines that start with a code, plus lines where nothing after the "Main"
# label contains a digit. In a raw string the non-digit class is written \D;
# \\D would instead match a literal backslash followed by the letter D.
w = pd.DataFrame([line.split() for line in re.findall(r"^\d+.*$|.*Main\S* \D*$", b, re.M)])
0 1 2 3 4 5 6 7 8
0 nan Main1 nan nan nan nan nan nan nan
1 1111 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1 nan nan nan
2 2885 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1 nan nan nan
3 0001 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_2 nan nan nan
4 0011 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_3 Sub6_1 Sub7_1 nan
5 9494 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 nan
6 8479 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 Sub8_1
7 15 Main1 Sub2_1 Sub3_2 nan nan nan nan nan
8 nan Main2 nan nan nan nan nan nan nan
9 nan Main3 nan nan nan nan nan nan nan
I'm not sure whether I got your question 100% right, but this seems to replicate your desired output (assuming that there is some additional data that is not present in `df`, as noted in Onyambu's comment).
# Flatten the data frame row by row into a single character vector, so that
# reading order (left to right, top to bottom) becomes vector order.
vec <- c(t(as.matrix(df)))
# Numbers of the sub-levels present in the hierarchy (Sub2_* ... Sub8_*).
subLevels <- 2:8
# Regex patterns: the 4-digit code first, then "Main", then one per sub-level.
patterns <- c("[0-9]{4,4}", "Main[0-9]{1,}", paste0("Sub", subLevels, "_[0-9]{1,}"))
# Positions in vec that match each pattern.
idxList <- lapply(patterns, grep, vec)
# For each pattern, a copy of vec with every non-matching position set to NA.
# setdiff() is used instead of tmp[-x] because a negative index built from an
# empty match selects nothing, which would leave the whole vector unmasked.
valList <- lapply(idxList, function(x) {
  tmp <- vec
  tmp[setdiff(seq_along(tmp), x)] <- NA
  tmp
})
# zoo provides na.locf(), which carries the last non-missing value forward.
library(zoo)
# One row per 4-digit code: for every level, carry its labels forward along
# the flattened vector and read off the label in force at each code position.
data.frame(
  code_f = na.omit(valList[[1]]),
  do.call(
    "cbind",
    lapply(valList[-1], function(level_vals) {
      carried <- na.locf(level_vals, na.rm = FALSE)
      carried[idxList[[1]]]
    })
  )
)
# code_f X1 X2 X3 X4 X5 X6 X7 X8
# 1 1111 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1 <NA> <NA> <NA>
# 2 2885 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1 <NA> <NA> <NA>
# 3 0001 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_2 <NA> <NA> <NA>
# 4 0011 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_3 Sub6_1 Sub7_1 <NA>
# 5 9494 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 <NA>
# 6 8479 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 Sub8_1
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.