簡體   English   中英

拆分R中不同列中的值

[英]Splitting values in different columns in R

我的數據集中的一列包含如下值

utm_source=google&utm_medium=cpc&utm_campaign=1234567&utm_term=brand%20&utm_content=Brand&gclid=ERtyuiipotf_YTj

我應該如何將其拆分為不同的列及其在 R 中的值?

utm_source utm_medium  utm_campaign utm_term   utm_content
  google      cpc          1234567   brand%20     Brand

dput(column)給出以下輸出

structure(list("null", "gclid=ertyyhglkdl-kjkY", 
    "utm_source=google&utm_medium=cpc&utm_campaign=1234556&utm_term=brand%20shirts&utm_content=Brand&gclid=jhajsgjdgd_ajs", 
    "utm_source=google&utm_medium=cpc&utm_campaign=1674814043&utm_term=brand%20shirts&utm_content=Brand&gclid=KvgMsEAAYASAAEgLq6vD_BwE", 
    "null", "null", "null", "null", "null", "null", "null", "null", 
    "null", "null", "utm_source=fb&utm_medium=ctw&utm_campaign=Shirt_rem&utm_content=CasciaShirt"), class = c("extracted", 
"list"))

以 OP 更新後的示例作為 list,我們遍歷該 list:若元素不為 "null",則創建一個 tibble,用 separate_rows 按 & 將該列拆分為多行,再用 separate 按 = 將其拆分為兩列(鍵與值),接著用 deframe 轉換為命名向量,最後用 as_tibble_row 轉成單行的 tibble

library(dplyr)
library(tidyr)
library(tibble)
library(purrr)
# Parse each query string in `lst1` into a one-row tibble (one column per
# key) and row-bind the results. Elements equal to "null" yield NULL, which
# map_dfr() drops, so they contribute no row.
map_dfr(lst1, function(x) {
  if (x != "null") {
    tibble(col1 = x) %>%
      # one row per `key=value` pair
      separate_rows(col1, sep = "&") %>%
      # Split at the first "=" only: extra = "merge" keeps any further "="
      # (e.g. base64 padding) inside the value instead of dropping it.
      separate(col1, into = c("col1", "col2"), sep = "=", extra = "merge") %>%
      # two-column tibble -> named character vector (names are the keys)
      deframe() %>%
      as_tibble_row()
  }
})

-輸出

# A tibble: 4 x 6
#  gclid                    utm_source utm_medium utm_campaign utm_term       utm_content
#  <chr>                    <chr>      <chr>      <chr>        <chr>          <chr>      
#1 ertyyhglkdl-kjkY         <NA>       <NA>       <NA>         <NA>           <NA>       
#2 jhajsgjdgd_ajs           google     cpc        1234556      brand%20shirts Brand      
#3 KvgMsEAAYASAAEgLq6vD_BwE google     cpc        1674814043   brand%20shirts Brand      
#4 <NA>                     fb         ctw        Shirt_rem    <NA>           CasciaShirt

或者,不必在循環中逐個處理:我們可以先把 list 轉換為 data.frame(tibble)的一列,一次完成拆分,再轉換為寬格式

library(data.table)
# Non-looping alternative: drop the "null" placeholders, parse all remaining
# query strings in a single pass, then reshape to one row per input string.
keep(lst1, ~ .x != "null") %>%
  flatten_chr() %>%
  tibble(col1 = .) %>%
  # remember which input string each key/value pair came from
  mutate(rn = row_number()) %>%
  # one row per `key=value` pair
  separate_rows(col1, sep = "&") %>%
  # Split at the first "=" only: extra = "merge" keeps any further "="
  # (e.g. base64 padding) inside the value instead of dropping it.
  separate(col1, into = c("col1", "col2"), sep = "=", extra = "merge") %>%
  pivot_wider(names_from = col1, values_from = col2) %>%
  select(-rn)
# A tibble: 4 x 6
#  gclid                    utm_source utm_medium utm_campaign utm_term       utm_content
#  <chr>                    <chr>      <chr>      <chr>        <chr>          <chr>      
#1 ertyyhglkdl-kjkY         <NA>       <NA>       <NA>         <NA>           <NA>       
#2 jhajsgjdgd_ajs           google     cpc        1234556      brand%20shirts Brand      
#3 KvgMsEAAYASAAEgLq6vD_BwE google     cpc        1674814043   brand%20shirts Brand      
#4 <NA>                     fb         ctw        Shirt_rem    <NA>           CasciaShirt

數據

# Sample input: 15 query strings in which the literal string "null" marks a
# missing entry. The class vector is set to match the dput() output shown in
# the question.
lst1 <- c(
  list(
    "null",
    "gclid=ertyyhglkdl-kjkY",
    "utm_source=google&utm_medium=cpc&utm_campaign=1234556&utm_term=brand%20shirts&utm_content=Brand&gclid=jhajsgjdgd_ajs",
    "utm_source=google&utm_medium=cpc&utm_campaign=1674814043&utm_term=brand%20shirts&utm_content=Brand&gclid=KvgMsEAAYASAAEgLq6vD_BwE"
  ),
  # ten consecutive missing entries
  rep(list("null"), 10L),
  list(
    "utm_source=fb&utm_medium=ctw&utm_campaign=Shirt_rem&utm_content=CasciaShirt"
  )
)
class(lst1) <- c("extracted", "list")

我不確定這是否是預期的輸出。以下是一個可能符合您需求的 base R 方案

# Parse every query string in `column` into a one-row data.frame (one column
# per key), then fold them together with an outer merge so no key is lost.
Reduce(
  function(...) merge(..., all = TRUE),
  lapply(
    column,
    function(x) {
      pairs <- unlist(strsplit(x, "&"))
      # Split each pair at its FIRST "=" only, so a value that itself
      # contains "=" (e.g. base64 padding) is kept whole rather than being
      # cut down to the text after the last "=". A token without "=" (such
      # as "null") is used unchanged as both key and value.
      keys <- sub("=.*", "", pairs)
      values <- sub("^[^=]*=", "", pairs)
      setNames(data.frame(as.list(values)), keys)
    }
  )
)

結果如下

  utm_source utm_medium utm_campaign utm_content null                    gclid
1         fb        ctw    Shirt_rem CasciaShirt <NA>                     <NA>
2     google        cpc      1234556       Brand <NA>           jhajsgjdgd_ajs
3     google        cpc   1674814043       Brand <NA> KvgMsEAAYASAAEgLq6vD_BwE
4       <NA>       <NA>         <NA>        <NA> null         ertyyhglkdl-kjkY
        utm_term
1           <NA>
2 brand%20shirts
3 brand%20shirts
4           <NA>

更新

如果您想保留所有行(即使其值為 "null"),可以試試下面的代碼

# Variant that keeps one output row per element of `column`: "null" entries
# are mapped to NA and stacked as all-NA rows instead of being merged away.
Reduce(
  function(x, y) {
    # Scalar condition, so use the short-circuit `||` rather than the
    # elementwise `|`. An all-NA side has no key columns to join on, so it is
    # appended with rbind() instead.
    if (all(is.na(x)) || all(is.na(y))) {
      return(rbind(x, y))
    }
    dplyr::full_join(x, y)
  },
  lapply(
    column,
    function(x) {
      if (x == "null") {
        return(NA)
      }
      pairs <- unlist(strsplit(x, "&"))
      # Split each pair at its FIRST "=" only, so a value that itself
      # contains "=" (e.g. base64 padding) is kept whole rather than being
      # cut down to the text after the last "=".
      keys <- sub("=.*", "", pairs)
      values <- sub("^[^=]*=", "", pairs)
      setNames(data.frame(as.list(values)), keys)
    }
  )
)

結果如下

                      gclid utm_source utm_medium utm_campaign       utm_term
1                      <NA>       <NA>       <NA>         <NA>           <NA>
2          ertyyhglkdl-kjkY       <NA>       <NA>         <NA>           <NA>
3            jhajsgjdgd_ajs     google        cpc      1234556 brand%20shirts
4  KvgMsEAAYASAAEgLq6vD_BwE     google        cpc   1674814043 brand%20shirts
5                      <NA>       <NA>       <NA>         <NA>           <NA>
6                      <NA>       <NA>       <NA>         <NA>           <NA>
7                      <NA>       <NA>       <NA>         <NA>           <NA>
8                      <NA>       <NA>       <NA>         <NA>           <NA>
9                      <NA>       <NA>       <NA>         <NA>           <NA>
10                     <NA>       <NA>       <NA>         <NA>           <NA>
11                     <NA>       <NA>       <NA>         <NA>           <NA>
12                     <NA>       <NA>       <NA>         <NA>           <NA>
13                     <NA>       <NA>       <NA>         <NA>           <NA>
14                     <NA>       <NA>       <NA>         <NA>           <NA>
15                     <NA>         fb        ctw    Shirt_rem           <NA>
   utm_content
1         <NA>
2         <NA>
3        Brand
4        Brand
5         <NA>
6         <NA>
7         <NA>
8         <NA>
9         <NA>
10        <NA>
11        <NA>
12        <NA>
13        <NA>
14        <NA>
15 CasciaShirt

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM