[英]How to subset a dataframe based on columns that match another dataframe?
我想從 results.interaction.model 中選取那些列名同時出現在 all.dnam 數據框中的列，以此創建 results.interaction.subset 數據框。
results.interaction.model 和 all.dnam 這兩個數據框共有 18 個公共列（列名相同的列）。
> which( colnames(results.interaction.model) %in% colnames(all.dnam) )
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 16 17 19 20 21
但是，當我根據這些公共列對 results.interaction.model 取子集時，結果只返回了兩列。
# Asker's attempt, kept verbatim. Two things make it return only two columns:
#   1. The which(...) index is placed before the comma, i.e. in the ROW
#      position, so it picks rows by number rather than the shared columns.
#   2. unlist() flattens the selected rows into one vector, table() counts
#      each distinct value, and as.data.frame() of that one-way table yields
#      a value column plus a frequency column.
results.interaction.subset <- as.data.frame(table(unlist(results.interaction.model[ which( colnames(results.interaction.model) %in% colnames(all.dnam) ), ])))
數據:
results.interaction.model
> dput(results.interaction.model[1:10])
structure(list(regionID = c("chr1:959207-959208", "chr1:959207-959208",
"chr1:959207-959208", "chr1:959207-959208", "chr1:959207-959208",
"chr1:1000481-1000482", "chr1:1116515-1116516", "chr1:1281750-1281751",
"chr1:1282131-1282132", "chr1:1282131-1282132"), probeID = c("cg04485075",
"cg04485075", "cg04485075", "cg04485075", "cg04485075", "cg00447632",
"cg08881995", "cg06909630", "cg00825734", "cg00825734"), target_symbol = c("NOC2L",
"NOC2L", "KLHL17", "KLHL17", "KLHL17", "ISG15", "C1orf159", "SCNN1D",
"SCNN1D", "SCNN1D"), target = c("ENSG00000188976", "ENSG00000188976",
"ENSG00000187961", "ENSG00000187961", "ENSG00000187961", "ENSG00000187608",
"ENSG00000131591", "ENSG00000162572", "ENSG00000162572", "ENSG00000162572"
), TF_symbol = c("ELK4", "THAP1", "THAP1", "KLF5", "ETV5", "RELA",
"KLF5", "PRDM1", "NFIC", "SNAI1"), TF = c("ENSG00000158711",
"ENSG00000131931", "ENSG00000131931", "ENSG00000102554", "ENSG00000244405",
"ENSG00000173039", "ENSG00000102554", "ENSG00000057657", "ENSG00000141905",
"ENSG00000124216"), distance_region_target_tss = c(100, 100,
-1375, -1375, -1375, -655, -153, 1313, 1694, 1694), target_region = c("chr1:944203-959309",
"chr1:944203-959309", "chr1:960584-965719", "chr1:960584-965719",
"chr1:960584-965719", "chr1:1001138-1014540", "chr1:1081818-1116361",
"chr1:1280436-1292029", "chr1:1280436-1292029", "chr1:1280436-1292029"
), met.IQR = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), RLM_DNAmGroup_pvalue = c(0.00533045223821893,
0.269498808026373, 0.106850951179939, 0.737849880311862, 0.0230970250677056,
0.0679623870851049, 0.0038827086445532, 0.721008613093728, 0.0774394444145674,
0.000612845575647114), RLM_DNAmGroup_fdr = c(0.0533045223821893,
0.336873510032967, 0.213701902359879, 0.779545220292938, 0.0769900835590187,
0.0679623870851049, 0.0038827086445532, 0.721008613093728, 0.0774394444145674,
0.00122569115129423), RLM_TF_pvalue = c(0.000240408628747302,
0.00967551181571213, 1.49822287909274e-09, 0.000292719662255525,
0.00618840879521954, 0.00179253431503446, 0.452246528946535,
0.00234418657279578, 1.7960719511656e-06, 0.517725724264309),
RLM_TF_fdr = c(0.000975732207518417, 0.0193510236314243,
1.49822287909274e-08, 0.000975732207518417, 0.0154710219880488,
0.00179253431503446, 0.452246528946535, 0.00234418657279578,
3.5921439023312e-06, 0.517725724264309), `RLM_DNAmGroup:TF_pvalue` = c(0.0477235974653309,
0.446924723241917, 0.18614609117704, 0.667529698315539, 0.0785854522508826,
0.0525551281674801, 0.000669386554848383, 0.662460106324088,
0.00893706117486159, 0.610417816470398), `RLM_DNAmGroup:TF_fdr` = c(0.324105485296749,
0.698683488173363, 0.4653652279426, 0.698683488173363, 0.324105485296749,
0.0525551281674801, 0.000669386554848383, 0.662460106324088,
0.0178741223497232, 0.610417816470398), RLM_DNAmGroup_estimate = c(-1.37422624654216,
-1.19017301712196, -4.03293591748801, -0.35645377491457,
-3.8339403287343, 12.0286802799871, -2.78005839590399, -0.655219229413673,
3.59394098562824, -3.55478802638191), RLM_TF_estimate = c(-0.190759510600258,
-0.232045171131347, -1.33229518558913, 0.298181138499663,
-0.371852537012647, 1.47733874118035, -0.0562213201987235,
-0.468753054985941, -0.589036223592858, 0.0879194889561481
), `RLM_DNAmGroup:TF_estimate` = c(0.142736765645357, 0.110237497587508,
0.444463040091438, -0.0556779464074764, 0.33964931110994,
-1.16778588303793, 0.402784394412433, 0.102709381127899,
-0.557414220417646, 0.0965723392687151), Model.quantile = c("Robust Linear Model",
"Robust Linear Model", "Robust Linear Model", "Robust Linear Model",
"Robust Linear Model", "Robust Linear Model", "Robust Linear Model",
"Robust Linear Model", "Robust Linear Model", "Robust Linear Model"
), Target_gene_DNAm_high_vs_Target_gene_DNAm_low_wilcoxon_pvalue = c(2.1412139799407e-08,
2.1412139799407e-08, 1.15677939082746e-06, 1.15677939082746e-06,
1.15677939082746e-06, 0.000615392235920677, 0.00298164989940998,
0.00366189819005736, 1.54702834819721e-12, 1.54702834819721e-12
), TF_DNAm_high_vs_TF_DNAm_low_wilcoxon_pvalue = c(0.918438542048966,
0.0464669455108157, 0.0464669455108157, 0.0697343857747443,
0.720893397374373, 0.519583665503581, 0.00993093689687174,
5.3480514392441e-16, 4.20885311279533e-08, 1.17793835284239e-10
), `% of target genes not expressed in DNAm_low and DNAm_high` = c("0 %",
"0 %", "0 %", "0 %", "0 %", "0 %", "0 %", "0 %", "0 %", "0 %"
)), row.names = c(NA, -10L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x0000029cc3feb5c0>)
all.dnam
> dput(all.dnam[1:10])
structure(list(regionID = c("chr11:73647473-73647474", "chr12:26834132-26834133",
"chr10:49761673-49761674", "chr5:40841488-40841489", "chr16:68236355-68236356",
"chr5:103563028-103563029"), probeID = c("cg06008062", "cg07645296",
"cg02430797", "cg05981038", "cg04328477", "cg07655627"), target_symbol = c("PLEKHB1",
"ITPR2", "OGDHL", "CARD6", "ESRP2", "NUDT12"), target = c("ENSG00000021300",
"ENSG00000123104", "ENSG00000197444", "ENSG00000132357", "ENSG00000103067",
"ENSG00000112874"), TF_symbol = c("VDR", "FOXN3", "RUNX2", "SOX9",
"KLF5", "NFATC2"), TF = c("ENSG00000111424", "ENSG00000053254",
"ENSG00000124813", "ENSG00000125398", "ENSG00000102554", "ENSG00000101096"
), distance_region_target_tss = c(1294L, -937L, 704L, 179L, 1745L,
-237L), target_region = c("chr11:73646178-73662819", "chr12:26335352-26833194",
"chr10:49734641-49762379", "chr5:40841308-40860175", "chr16:68229033-68238102",
"chr5:103548855-103562790"), met.IQR = c(0L, 0L, 0L, 0L, 0L,
0L), RLM_DNAmGroup_pvalue = c(0.0611683203352245, 0.37071989654808,
0.0031516822161683, 0.0047522559428827, 0.581928610825936, 0.0010290648987458
)), row.names = c(62L, 73L, 56L, 8L, 115L, 9L), class = "data.frame")
這樣應該就可以解決問題。請注意，我把 results.interaction.model 改名為 d，並用 d_sub 代替 results.interaction.subset，因為這兩個名稱更短、更易於閱讀。
# Short alias for the table whose columns are being subset.
d <- results.interaction.model

# TRUE for every column of `d` whose name also appears in `all.dnam`.
is_shared <- colnames(d) %in% colnames(all.dnam)

# The dput() of results.interaction.model shows it carries the "data.table"
# class. When data.table is attached, d[, <logical expression>] evaluates the
# expression and returns the logical vector itself instead of selecting
# columns. Converting to a plain data.frame first makes the column selection
# behave the same whether or not data.table is loaded.
# drop = FALSE keeps the result a data.frame even if only one column matches.
d_sub <- as.data.frame(d)[, is_shared, drop = FALSE]
您可以檢查 d_sub 是否具有正確的名稱:
dplyr::glimpse(d_sub)
Rows: 10
Columns: 10
$ regionID <chr> "chr1:959207-959208", "chr1:959207-959208", "ch…
$ probeID <chr> "cg04485075", "cg04485075", "cg04485075", "cg04…
$ target_symbol <chr> "NOC2L", "NOC2L", "KLHL17", "KLHL17", "KLHL17",…
$ target <chr> "ENSG00000188976", "ENSG00000188976", "ENSG0000…
$ TF_symbol <chr> "ELK4", "THAP1", "THAP1", "KLF5", "ETV5", "RELA…
$ TF <chr> "ENSG00000158711", "ENSG00000131931", "ENSG0000…
$ distance_region_target_tss <dbl> 100, 100, -1375, -1375, -1375, -655, -153, 1313…
$ target_region <chr> "chr1:944203-959309", "chr1:944203-959309", "ch…
$ met.IQR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ RLM_DNAmGroup_pvalue <dbl> 0.0053304522, 0.2694988080, 0.1068509512, 0.737…
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.