繁体   English   中英

通过将函数应用于数据框中的另一列来实现新列

[英]New column by applying function to another column in data frame

这是我正在使用的数据的示例数据框。 对于熟悉遗传数据格式的人来说,它基本上是一个修改过的VCF文件。 如果不是,基本上每行包含基因组中可能存在变体的位置的信息。

samp <- structure(list(Chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr12", class = "factor"), 
    Pos = c(8613204L, 8613412L, 8614238L, 8614506L, 8614652L, 
    8614669L, 8614768L, 8614951L, 8614986L, 8615225L, 8615809L, 
    8616149L, 8616392L), Ref = structure(c(1L, 1L, 4L, 3L, 3L, 
    3L, 2L, 3L, 2L, 4L, 2L, 4L, 3L), .Label = c("A", "C", "G", 
    "T"), class = "factor"), Alt = structure(c(3L, 2L, 2L, 1L, 
    1L, 1L, 3L, 1L, 1L, 3L, 4L, 2L, 4L), .Label = c("A", "C", 
    "G", "T"), class = "factor"), Info = c("AC=3913;AF=0.78135;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8357;AFR_AF=0.5779;EUR_AF=0.7366;SAS_AF=0.8466;AA=G|||;CSQ=G|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881", 
    "AC=4051;AF=0.808906;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8444;AFR_AF=0.6725;EUR_AF=0.7366;SAS_AF=0.8538;AA=C|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881", 
    "AC=4021;AF=0.802915;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8415;AFR_AF=0.6558;EUR_AF=0.7376;SAS_AF=0.8466;AA=T|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.7997", 
    "AC=3990;AF=0.796725;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8386;AFR_AF=0.6339;EUR_AF=0.7376;SAS_AF=0.8466;AA=A|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881", 
    "AC=4069;AF=0.8125;AN=5008;NS=2504;DP=17188;EAS_AF=0.9921;AMR_AF=0.8487;AFR_AF=0.6528;EUR_AF=0.7714;SAS_AF=0.8599;AA=A|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029", 
    "AC=4044;AF=0.807508;AN=5008;NS=2504;DP=-128;EAS_AF=0.9911;AMR_AF=0.8458;AFR_AF=0.6362;EUR_AF=0.7714;SAS_AF=0.8599;AA=G|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029", 
    NA, NA, "AC=3795;AF=0.757788;AN=5008;NS=2504;DP=-128;EAS_AF=0.9653;AMR_AF=0.7954;AFR_AF=0.5651;EUR_AF=0.7167;SAS_AF=0.82;AA=c|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029", 
    NA, "AC=4053;AF=0.809305;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8458;AFR_AF=0.6362;EUR_AF=0.7724;SAS_AF=0.8671;AA=C|||;CSQ=T|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029", 
    "AC=4076;AF=0.813898;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8473;AFR_AF=0.6528;EUR_AF=0.7724;SAS_AF=0.8671;AA=C|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029", 
    "AC=4052;AF=0.809105;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8473;AFR_AF=0.6346;EUR_AF=0.7724;SAS_AF=0.8671;AA=T|||;CSQ=T|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029"
    ), TG_rs = c("rs10770739", "rs10770740", "rs4883148", "rs4883149", 
    "rs4883150", "rs4883151", NA, NA, "rs7303948", NA, "rs4242889", 
    "rs4883154", "rs4242890")), row.names = c(NA, -13L), .Names = c("Chrom", 
"Pos", "Ref", "Alt", "Info", "TG_rs"), class = "data.frame")

我想要做的是从“信息”列中提取值。 但是,此列中包含的信息对于每一行都不相同,并且不总是以相同的顺序出现。 因此,我想使用模式匹配来获取我感兴趣的值。

我写了一个小功能来提取信息栏中包含的各种“超级人群”(例如AMR,AFR,EUR,SAS,EAS)的“等位基因频率”(AF)。

extractAF <- function(pop, vec) {
  info <- unlist((strsplit(vec, ";", fixed=TRUE)))
  AF <- as.numeric(unlist(strsplit((info[grep(pop, (unlist((strsplit(vec, ";", fixed=TRUE)))))]), "=", fixed=TRUE))[2])
  return(AF)
}

这个函数需要两个参数:'pop'是一个字符串,用于指定要提取的超级填充,以及'vec',它用于获取我的数据帧的Info列。

通过单个向量时,该函数按预期工作:

extractAF("AFR_AF", samp[1,'Info'])
#[1] 0.5779

extractAF("AFR_AF", samp[5,'Info'])
#[1] 0.6528

但是,我希望对数据帧的每一行执行此操作,并创建包含数据的新列。 当我使用dplyr的mutate函数时,我最后得到一个具有相同值的列:

library("dplyr")
mutate(samp, AFR_AF = extractAF("AFR_AF", Info))

我读了一篇文章(我现在似乎无法找到,否则我会引用它),表示mutate一次传递所有行,而不是我需要的逐行传递。

所以我根据这篇文章尝试了以下几种方法:

apply(samp[,'Info'], 1, function(x) extractAF("AFR_AF", x))

应用中出错(samp [,“Info”],1,function(x)extractAF(“AMR_AF”,x)):dim(X)必须具有正长度

samp[, extractAF("AMR_AF", Info), by = .I]

[.data.frame (samp ,, extractAF(“AMR_AF”,Info)中的错误,by = .I):未使用的参数(by = .I)

samp[, extractAF("AMR_AF", Info), by = 1:nrow(samp)]

Error in `[.data.frame`(samp, , extractAF("AMR_AF", Info), by = 1:nrow(samp)) : 
  unused argument (by = 1:nrow(samp))

UPDATE

在下面的INFO列中包含NA和AF = 0的其他样本数据集:

结构(列表(CHROM = c(“chr1”,“chr1”,“chr1”,“chr1”,“chr1”,“chr1”),POS = c(16090898L,16091074L,16091583L,16092212L,16093560L,16093639L), ID = c(“rs6429774”,“rs6429776”,NA,“rs74528955”,“rs904912”,NA),REF = c(“G”,“A”,“T”,“C”,“T”,“ C“),ALT = c(”A“,”G“,”A“,”T“,”A“,”T“),QUAL = c(NA,NA,NA,NA,NA,NA), FILTER = c(NA,NA,NA,NA,NA,NA),INFO = c(“AC = 1606; AF = 0.320687; AN = 5008; NS = 2504; DP = 21565; EAS_AF = 0.1419; AMR_AF = 0.2983; AFR_AF = 0.525; EUR_AF = 0.3509; SAS_AF = 0.2137; AA = G |||; CSQ = A | ENSG00000162458 | ENST00000441801 |文稿| upstream_gene_variant ||||||| 96 | 1 ||||||; ERB = A || proximal_1216 | Regulatory_Feature | proximal_enhancer; FUNSEQ = 0.3335“,”AC = 1690; AF = 0.33746; AN = 5008; NS = 2504; DP = 20247; EAS_AF = 0.1498; AMR_AF = 0.3012; AFR_AF = 0.5681; EUR_AF = 0.3549; SAS_AF = 0.227; AA = G |||; CSQ = G | ENSG00000162458 | ENST00000441801 |文稿| 5_prime_UTR_variant | 81 ||||||| 1 ||||||; ERB = G || proximal_1216 | Regulatory_Feature | proximal_enhancer; FUNSEQ = 0.3335“,NA,”AC = 8; AF = 0.00159744; AN = 5008; NS = 2504; DP = 19197; EAS_AF = 0.0 079; AMR_AF = 0; AFR_AF = 0; EUR_AF = 0; SAS_AF = 0; AA = C |||; CSQ = T | ENSG00000162458 | ENST00000441801 |文稿| intron_variant |||||||| 1 ||||| |; GENCODE = ENST00000441801; ERB = T || proximal_1216 | Regulatory_Feature | proximal_enhancer; FUNSEQ = 0.3335“,”AC = 3282; AF = 0.655351; AN = 5008; NS = 2504; DP = 14721; EAS_AF = 0.8343; AMR_AF = 0.6916 ; AFR_AF = 0.4259; EUR_AF = 0.6531; SAS_AF = 0.7577; AA = A |||; CSQ = A | ENSG00000162458 | ENST00000441801 |文稿| intron_variant |||||||| 1 ||||||; GENCODE = ENST00000441801 ; FUNSEQ = 0.1483“,”AC = 5; AF = 0.000998403; AN = 5008; NS = 2504; DP = 14736; EAS_AF = 0.003; AMR_AF = 0; AFR_AF = 0; EUR_AF = 0; SAS_AF = 0.002; AA = C |||; CSQ = T | ENSG00000162458 | ENST00000441801 | Transcript | intron_variant |||||||| 1 ||||||; GENCODE = ENST00000441801; FUNSEQ = 0.1483“)),row.names = 14:19, class =“data.frame”,。Name = c(“CHROM”,“POS”,“ID”,“REF”,“ALT”,“QUAL”,“FILTER”,“INFO”))

由于sub是矢量化的,因此您可能不需要这些公式。 首先创建所有可能代码的变量,如(AFR,AMR,EUR等)。 使用该向量创建搜索模式以浏览“ Info列并返回包含所有匹配项的新数据框:

all_pop <- c("AMR_AF", "AFR_AF", "EUR_AF", "SAS_AF", "EAS_AF")
pat <- paste0(".*\\b", all_pop, "=(\\d+(\\.\\d+)?)\\b.*")

out <- sapply(pat, sub, "\\1", samp$Info)
newdf <- setNames(as.data.frame(out), all_pop)
#      AMR_AF AFR_AF EUR_AF SAS_AF EAS_AF
#   1  0.8357 0.5779 0.7366 0.8466 0.9921
#   2  0.8444 0.6725 0.7366 0.8538 0.9921
#   3  0.8415 0.6558 0.7376 0.8466 0.9921
#   4  0.8386 0.6339 0.7376 0.8466 0.9921
#   5  0.8487 0.6528 0.7714 0.8599 0.9921
#   6  0.8458 0.6362 0.7714 0.8599 0.9911
#   7    <NA>   <NA>   <NA>   <NA>   <NA>
#   8    <NA>   <NA>   <NA>   <NA>   <NA>
#   9  0.7954 0.5651 0.7167   0.82 0.9653
#   10   <NA>   <NA>   <NA>   <NA>   <NA>
#   11 0.8458 0.6362 0.7724 0.8671 0.9921
#   12 0.8473 0.6528 0.7724 0.8671 0.9921
#   13 0.8473 0.6346 0.7724 0.8671 0.9921

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM