[英]Splitting values in different columns in R
我的數據集中的一列包含如下值
utm_source=google&utm_medium=cpc&utm_campaign=1234567&utm_term=brand%20&utm_content=Brand&gclid=ERtyuiipotf_YTj
我應該如何將其拆分為不同的列及其在 R 中的值?
utm_source utm_medium utm_campaign utm_term utm_content
google cpc 1234567 brand%20 Brand
dput(column)
給出以下輸出
structure(list("null", "gclid=ertyyhglkdl-kjkY",
"utm_source=google&utm_medium=cpc&utm_campaign=1234556&utm_term=brand%20shirts&utm_content=Brand&gclid=jhajsgjdgd_ajs",
"utm_source=google&utm_medium=cpc&utm_campaign=1674814043&utm_term=brand%20shirts&utm_content=Brand&gclid=KvgMsEAAYASAAEgLq6vD_BwE",
"null", "null", "null", "null", "null", "null", "null", "null",
"null", "null", "utm_source=fb&utm_medium=ctw&utm_campaign=Shirt_rem&utm_content=CasciaShirt"), class = c("extracted",
"list"))
以 OP 更新後的示例作為 list,我們遍歷該 list:若元素不為 "null",則創建一個 tibble,先用 separate_rows 以 & 為分隔符把該列拆分成多行,再用 separate 以 = 為分隔符把該列拆分成兩列(鍵與值),接著用 deframe 轉換為命名向量,最後用 as_tibble_row 轉成單行的 tibble。
library(dplyr)
library(tidyr)
library(tibble)
library(purrr)
# Parse every query string in `lst1` into a one-row tibble (one column per
# key) and row-bind the results. Elements equal to "null" yield NULL from the
# `if` and therefore contribute no row.
map_dfr(lst1, ~ if (.x != "null") {
  tibble(col1 = .x) %>%
    # One row per "key=value" pair.
    separate_rows(col1, sep = "&") %>%
    # extra = "merge" splits on the first "=" only, so a value that itself
    # contains "=" is kept whole instead of being truncated with a warning.
    separate(col1, into = c("col1", "col2"), sep = "=", extra = "merge") %>%
    # Two columns (key, value) -> named character vector -> one-row tibble.
    deframe() %>%
    as_tibble_row()
})
-輸出
# A tibble: 4 x 6
# gclid utm_source utm_medium utm_campaign utm_term utm_content
# <chr> <chr> <chr> <chr> <chr> <chr>
#1 ertyyhglkdl-kjkY <NA> <NA> <NA> <NA> <NA>
#2 jhajsgjdgd_ajs google cpc 1234556 brand%20shirts Brand
#3 KvgMsEAAYASAAEgLq6vD_BwE google cpc 1674814043 brand%20shirts Brand
#4 <NA> fb ctw Shirt_rem <NA> CasciaShirt
或者,不在循環中逐個處理,而是先把 list 轉換為 data.frame(tibble)的一列,一次完成拆分,再轉換為寬格式
library(data.table)
# Same result without per-element looping: drop the "null" entries, put the
# remaining strings in a single column, split once, and reshape to wide form.
keep(lst1, ~ .x != "null") %>%
  flatten_chr() %>%
  tibble(col1 = .) %>%
  # Row id so that pivot_wider() can tell which pairs belong to which string.
  mutate(rn = row_number()) %>%
  # One row per "key=value" pair.
  separate_rows(col1, sep = "&") %>%
  # extra = "merge" splits on the first "=" only, so a value that itself
  # contains "=" is kept whole instead of being truncated with a warning.
  separate(col1, into = c("col1", "col2"), sep = "=", extra = "merge") %>%
  pivot_wider(names_from = col1, values_from = col2) %>%
  select(-rn)
# A tibble: 4 x 6
# gclid utm_source utm_medium utm_campaign utm_term utm_content
# <chr> <chr> <chr> <chr> <chr> <chr>
#1 ertyyhglkdl-kjkY <NA> <NA> <NA> <NA> <NA>
#2 jhajsgjdgd_ajs google cpc 1234556 brand%20shirts Brand
#3 KvgMsEAAYASAAEgLq6vD_BwE google cpc 1674814043 brand%20shirts Brand
#4 <NA> fb ctw Shirt_rem <NA> CasciaShirt
# Sample input read by the snippets that reference `lst1`: a list of 15
# strings carrying the classes "extracted" and "list". Each element is either
# the literal "null" or a query string of key=value pairs joined by "&".
# This assignment has to be run before any snippet that uses `lst1`.
lst1 <- structure(list("null", "gclid=ertyyhglkdl-kjkY", "utm_source=google&utm_medium=cpc&utm_campaign=1234556&utm_term=brand%20shirts&utm_content=Brand&gclid=jhajsgjdgd_ajs",
"utm_source=google&utm_medium=cpc&utm_campaign=1674814043&utm_term=brand%20shirts&utm_content=Brand&gclid=KvgMsEAAYASAAEgLq6vD_BwE",
"null", "null", "null", "null", "null", "null", "null", "null",
"null", "null", "utm_source=fb&utm_medium=ctw&utm_campaign=Shirt_rem&utm_content=CasciaShirt"), class = c("extracted",
"list"))
我不確定這是否是預期的輸出。以下是一個可能符合您需求的 base R 方案
# Base R approach: turn every element of `column` into a one-row data frame
# whose column names are the keys and whose cells are the values, then fold
# all of those data frames together with an outer merge (all = TRUE).
Reduce(
  f = function(...) merge(..., all = TRUE),
  x = lapply(column, function(query) {
    pairs <- unlist(strsplit(query, "&"))
    # Key: text before the first "="; value: text after the last "=".
    # A piece without "=" (such as "null") is used as both key and value.
    keys <- gsub("=.*", "", pairs)
    values <- gsub(".*=", "", pairs)
    row <- data.frame(as.list(values))
    names(row) <- keys
    row
  })
)
結果如下:
utm_source utm_medium utm_campaign utm_content null gclid
1 fb ctw Shirt_rem CasciaShirt <NA> <NA>
2 google cpc 1234556 Brand <NA> jhajsgjdgd_ajs
3 google cpc 1674814043 Brand <NA> KvgMsEAAYASAAEgLq6vD_BwE
4 <NA> <NA> <NA> <NA> null ertyyhglkdl-kjkY
utm_term
1 <NA>
2 brand%20shirts
3 brand%20shirts
4 <NA>
更新
如果你想保留所有資料(即使其值為 "null"),可以試試下面的代碼
# Variant that keeps the "null" elements of `column` as rows of NA instead of
# dropping them. Each element is first parsed (a "null" element becomes NA,
# anything else becomes a one-row data frame of key/value columns); the parsed
# pieces are then folded together from left to right.
Reduce(
  function(x, y) {
    # If either side consists only of NA there is nothing to join on, so the
    # two sides are stacked with rbind(). `||` is used because `if` takes a
    # single scalar condition and both all() calls return a length-one value.
    if (all(is.na(x)) || all(is.na(y))) {
      return(rbind(x, y))
    }
    # Otherwise join on whatever columns the two sides have in common.
    dplyr::full_join(x, y)
  },
  lapply(
    column,
    function(x) {
      # Placeholder for a "null" element; handled by the rbind() branch above.
      if (x == "null") {
        return(NA)
      }
      u <- unlist(strsplit(x, "&"))
      # Values (text after the last "=") become the cells, keys (text before
      # the first "=") become the column names.
      setNames(data.frame(as.list(gsub(".*=", "", u))), gsub("=.*", "", u))
    }
  )
)
結果如下:
gclid utm_source utm_medium utm_campaign utm_term
1 <NA> <NA> <NA> <NA> <NA>
2 ertyyhglkdl-kjkY <NA> <NA> <NA> <NA>
3 jhajsgjdgd_ajs google cpc 1234556 brand%20shirts
4 KvgMsEAAYASAAEgLq6vD_BwE google cpc 1674814043 brand%20shirts
5 <NA> <NA> <NA> <NA> <NA>
6 <NA> <NA> <NA> <NA> <NA>
7 <NA> <NA> <NA> <NA> <NA>
8 <NA> <NA> <NA> <NA> <NA>
9 <NA> <NA> <NA> <NA> <NA>
10 <NA> <NA> <NA> <NA> <NA>
11 <NA> <NA> <NA> <NA> <NA>
12 <NA> <NA> <NA> <NA> <NA>
13 <NA> <NA> <NA> <NA> <NA>
14 <NA> <NA> <NA> <NA> <NA>
15 <NA> fb ctw Shirt_rem <NA>
utm_content
1 <NA>
2 <NA>
3 Brand
4 Brand
5 <NA>
6 <NA>
7 <NA>
8 <NA>
9 <NA>
10 <NA>
11 <NA>
12 <NA>
13 <NA>
14 <NA>
15 CasciaShirt
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.