繁体   English   中英

根据同一数据框中另一列的字符,向数据框添加一个文本列

[英]Add a text column to the dataframe based on characters in another column of the same dataframe

以下是我的数据框的一个小样本:

> print(df1)
        Section                     A                      B                      
1   ./100-12-S00.xlsx           0.6032591              0.4554364
2   ./108-15-S01.xlsx           0.7109148              0.5367121
3   ./92-12-S00.xlsx            0.6032591              0.4554364
4   ./124-9-S05.xlsx            0.7109148              0.5367121
5   ./116-15-S11.xlsx           0.6032591              0.4554364
6   ./108-9-S06.xlsx            0.7109148              0.5367121
7   ./84-12-S08.xlsx            0.6032591              0.4554364
8   ./124-15-S11.xlsx           0.7109148              0.5367121
9   ./92-12-S00.xlsx            0.6032591              0.4554364
10  ./116-15-S03.xlsx           0.7109148              0.5367121
11  ./100-12-S02.xlsx           0.6032591              0.4554364
12  ./84-9-S05.xlsx             0.7109148              0.5367121

我想根据 Section 列中的字符向 df1 添加一个新的文本列。具体规则是:如果 Section 中包含下表(df2)Section 列里的某个代码,就取该代码对应的 Names 值:

> print(df2)
     Section    Names
1    S00        Baseline
2    S01        Sample1
3    S02        Sample2
4    S03        Sample3
5    S04        Sample4
6    S05        Sample5
7    S06        Sample6
8    S07        Sample7
9    S08        Sample8
10   S09        Sample9
11   S10        Sample10
12   S11        AASHTO

我希望 df1 根据 Section 列中的内容包含一个名为 Names 的新列。

预期的输出是:

> print(df3)
        Section                 A              B          Names                  
1   ./100-12-S00.xlsx       0.6032591      0.4554364    Baseline
2   ./108-15-S01.xlsx       0.7109148      0.5367121    Sample1
3   ./92-12-S00.xlsx        0.6032591      0.4554364    Baseline
4   ./124-9-S05.xlsx        0.7109148      0.5367121    Sample5
5   ./116-15-S11.xlsx       0.6032591      0.4554364    AASHTO
6   ./108-9-S06.xlsx        0.7109148      0.5367121    Sample6
7   ./84-12-S08.xlsx        0.6032591      0.4554364    Sample8
8   ./124-15-S11.xlsx       0.7109148      0.5367121    AASHTO
9   ./92-12-S00.xlsx        0.6032591      0.4554364    Baseline
10  ./116-15-S03.xlsx       0.7109148      0.5367121    Sample3
11  ./100-12-S02.xlsx       0.6032591      0.4554364    Sample2
12  ./84-9-S05.xlsx         0.7109148      0.5367121    Sample5

我们可以从 df1 的 Section 列中提取子字符串(例如 S00)来创建一个新列,然后以该列为键与 df2 进行左连接(left join):

library(dplyr)
library(stringr)
# Look up Names for every row of df1:
#   1. pull the first "S<digits>" code out of each Section string into a
#      temporary key column (Section2);
#   2. left-join df2 on that key so every df1 row is kept;
#   3. drop the temporary key so only the original columns plus Names remain.
left_join(
  mutate(df1, Section2 = str_extract(Section, "S\\d+")),
  df2,
  by = c("Section2" = "Section")
) %>%
  select(-Section2)
#         Section         A         B    Names
#1  ./100-12-S00.xlsx 0.6032591 0.4554364 Baseline
#2  ./108-15-S01.xlsx 0.7109148 0.5367121  Sample1
#3   ./92-12-S00.xlsx 0.6032591 0.4554364 Baseline
#4   ./124-9-S05.xlsx 0.7109148 0.5367121  Sample5
#5  ./116-15-S11.xlsx 0.6032591 0.4554364   AASHTO
#6   ./108-9-S06.xlsx 0.7109148 0.5367121  Sample6
#7   ./84-12-S08.xlsx 0.6032591 0.4554364  Sample8
#8  ./124-15-S11.xlsx 0.7109148 0.5367121   AASHTO
#9   ./92-12-S00.xlsx 0.6032591 0.4554364 Baseline
#10 ./116-15-S03.xlsx 0.7109148 0.5367121  Sample3
#11 ./100-12-S02.xlsx 0.6032591 0.4554364  Sample2
#12   ./84-9-S05.xlsx 0.7109148 0.5367121  Sample5

或者,使用 fuzzyjoin 包中的 regex_left_join:

library(fuzzyjoin)
# Regex-based left join: the Section values of df2 act as patterns that are
# matched against the Section strings of df1. Both inputs have a Section
# column, so the df1 copy comes back as Section.x; keep it under its
# original name together with A, B and the looked-up Names.
df1 %>%
  regex_left_join(df2, by = "Section") %>%
  select(Section = Section.x, A, B, Names)
# A tibble: 12 x 4
#   Section               A     B Names   
#   <chr>             <dbl> <dbl> <chr>   
# 1 ./100-12-S00.xlsx 0.603 0.455 Baseline
# 2 ./108-15-S01.xlsx 0.711 0.537 Sample1 
# 3 ./92-12-S00.xlsx  0.603 0.455 Baseline
# 4 ./124-9-S05.xlsx  0.711 0.537 Sample5 
# 5 ./116-15-S11.xlsx 0.603 0.455 AASHTO  
# 6 ./108-9-S06.xlsx  0.711 0.537 Sample6 
# 7 ./84-12-S08.xlsx  0.603 0.455 Sample8 
# 8 ./124-15-S11.xlsx 0.711 0.537 AASHTO  
# 9 ./92-12-S00.xlsx  0.603 0.455 Baseline
#10 ./116-15-S03.xlsx 0.711 0.537 Sample3 
#11 ./100-12-S02.xlsx 0.603 0.455 Sample2 
#12 ./84-9-S05.xlsx   0.711 0.537 Sample5 

数据

# Sample input: twelve file-path strings (Section) with two numeric columns.
# Row names are stored as the character labels "1".."12".
df1 <- data.frame(
  Section = c(
    "./100-12-S00.xlsx", "./108-15-S01.xlsx", "./92-12-S00.xlsx",
    "./124-9-S05.xlsx", "./116-15-S11.xlsx", "./108-9-S06.xlsx",
    "./84-12-S08.xlsx", "./124-15-S11.xlsx", "./92-12-S00.xlsx",
    "./116-15-S03.xlsx", "./100-12-S02.xlsx", "./84-9-S05.xlsx"
  ),
  # A and B each alternate between the same two values from row to row.
  A = rep(c(0.6032591, 0.7109148), times = 6),
  B = rep(c(0.4554364, 0.5367121), times = 6),
  row.names = as.character(1:12),
  stringsAsFactors = FALSE
)

# Lookup table: one row per section code (Section) with its label (Names).
# Row names are stored as the character labels "1".."12".
df2 <- data.frame(
  Section = c(
    "S00", "S01", "S02", "S03", "S04", "S05",
    "S06", "S07", "S08", "S09", "S10", "S11"
  ),
  Names = c(
    "Baseline", "Sample1", "Sample2", "Sample3", "Sample4", "Sample5",
    "Sample6", "Sample7", "Sample8", "Sample9", "Sample10", "AASHTO"
  ),
  row.names = as.character(1:12),
  stringsAsFactors = FALSE
)

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM