简体   繁体   中英

Regex to extract string between characters in R

I have a pretty confusing file, with multiple delimiters and not the same order/number of fields between rows.

Columns V1 to V5 are fine but I want to extract from V9 the information from "Variant_seq", "Reference_seq" and the rsxxxx number from "Dbxref".

Another complication is that the "Variant_seq" and "Reference_seq" fields can either single characters ("A","T","C" or "G") or multiple ones comma separated (example "TTTT,TTC,GGGGGC"). These fields can be located either at the end or anywhere in the middle of V9.

V1 V2    V3    V4    V5     V9
9 dbSNP SNV 10007 10007 ID=1;Variant_seq=C;Dbxref=dbSNP_154:rs1449034754;evidence_values=Frequency,TOPMed;Reference_seq=T
9 dbSNP SNV 10009 10009 ID=2;Variant_seq=C,G;Dbxref=dbSNP_154:rs1587255763;evidence_values=Frequency;Reference_seq=A
9 dbSNP SNV 14824990 14824990 ID=30545117;Reference_seq=C;clinical_significance=benign;Variant_seq=GGGC,CCCCG;ancestral_allele=C;Dbxref=dbSNP_154:rs140144319;evidence_values=Frequency,1000Genomes,ESP,Phenotype_or_Disease,ExAC,TOPMed,gnomAD;global_minor_allele_frequency=1|0.004193|211

I originally thought of a awk -F '{print }' with multiple delimiters but quickly realized this wasn't a viable solution as fields aren't consistend across rows. dplyr::separate also isn't really adapted here.

I tried to extract each single field separately into a new column but the command doesn't handle the case where the fields are located at the end of the row:

gsub("Reference_seq[=]([^.]+)[;].*", "\\1", df$V9)

I can't find a solution to grep only the field in the capture group 1 and stop if there's no ";"following. Thanks for your help.

Since this is somewhat complex extraction, I find using the stringr package's str_extract() function easier to use here.

In this case, I am using 3 separate lines to extract the text of interest. and using the (?<=) look behind operator to avoid the leading text.

df<- read.table(header=TRUE, text="V1 V2    V3    V4    V5     V9
9 dbSNP SNV 10007 10007 ID=1;Variant_seq=C;Dbxref=dbSNP_154:rs1449034754;evidence_values=Frequency,TOPMed;Reference_seq=T
9 dbSNP SNV 10009 10009 ID=2;Variant_seq=C,G;Dbxref=dbSNP_154:rs1587255763;evidence_values=Frequency;Reference_seq=A
9 dbSNP SNV 14824990 14824990 ID=30545117;Reference_seq=C;clinical_significance=benign;Variant_seq=GGGC,CCCCG;ancestral_allele=C;Dbxref=dbSNP_154:rs140144319;evidence_values=Frequency,1000Genomes,ESP,Phenotype_or_Disease,ExAC,TOPMed,gnomAD;global_minor_allele_frequency=1|0.004193|211"
)

library(stringr)
str_extract(df[,"V9"], "(?<=Variant_seq=).+?;")
str_extract(df[,"V9"], "(?<=Reference_seq=).+?;")
str_extract(df[,"V9"], "(?<=Dbxref=).+?;")

data.frame(Variant_seq, Reference_seq, Db_ref)
#   Variant_seq Reference_seq                  Db_ref
# 1          C;          <NA> dbSNP_154:rs1449034754;
# 2        C,G;          <NA> dbSNP_154:rs1587255763;
# 3 GGGC,CCCCG;            C;  dbSNP_154:rs140144319;

This final data frame can now be cbind back to your original

You can do

stringr::str_match(df$V9, "Reference_seq=([^;]+);")[, 2L]
stringr::str_match(df$V9, "Variant_seq=([^;]+);")[, 2L]
stringr::str_match(df$V9, "Dbxref=([^;]+);")[, 2L]

Output

> stringr::str_match(df$V9, "Reference_seq=([^;]+);")[, 2L]
[1] NA  NA  "C"
> stringr::str_match(df$V9, "Variant_seq=([^;]+);")[, 2L]
[1] "C"          "C,G"        "GGGC,CCCCG"
> stringr::str_match(df$V9, "Dbxref=([^;]+);")[, 2L]
[1] "dbSNP_154:rs1449034754" "dbSNP_154:rs1587255763" "dbSNP_154:rs140144319" 

A possible solution is to use strsplit on the strings paired with sub for the substrings

cbind(df1, sapply(c("Variant","Reference_seq","Dbxref"), function(str) 
  sapply(strsplit(df1[,"V9"],";"), function(x) sub("dbSNP_.*:","",x[grep(str,x)]))))
  V1    V2  V3       V4       V5
1  9 dbSNP SNV    10007    10007
2  9 dbSNP SNV    10009    10009
3  9 dbSNP SNV 14824990 14824990
                                                                                                                                                                                                                                                              V9
1                                                                                                                                                              ID=1;Variant_seq=C;Dbxref=dbSNP_154:rs1449034754;evidence_values=Frequency,TOPMed;Reference_seq=T
2                                                                                                                                                                   ID=2;Variant_seq=C,G;Dbxref=dbSNP_154:rs1587255763;evidence_values=Frequency;Reference_seq=A
3 ID=30545117;Reference_seq=C;clinical_significance=benign;Variant_seq=GGGC,CCCCG;ancestral_allele=C;Dbxref=dbSNP_154:rs140144319;evidence_values=Frequency,1000Genomes,ESP,Phenotype_or_Disease,ExAC,TOPMed,gnomAD;global_minor_allele_frequency=1|0.004193|211
                 Variant   Reference_seq              Dbxref
1          Variant_seq=C Reference_seq=T Dbxref=rs1449034754
2        Variant_seq=C,G Reference_seq=A Dbxref=rs1587255763
3 Variant_seq=GGGC,CCCCG Reference_seq=C  Dbxref=rs140144319

Data

df1 <- structure(list(V1 = c(9L, 9L, 9L), V2 = c("dbSNP", "dbSNP", "dbSNP"
), V3 = c("SNV", "SNV", "SNV"), V4 = c(10007L, 10009L, 14824990L
), V5 = c(10007L, 10009L, 14824990L), V9 = c("ID=1;Variant_seq=C;Dbxref=dbSNP_154:rs1449034754;evidence_values=Frequency,TOPMed;Reference_seq=T", 
"ID=2;Variant_seq=C,G;Dbxref=dbSNP_154:rs1587255763;evidence_values=Frequency;Reference_seq=A", 
"ID=30545117;Reference_seq=C;clinical_significance=benign;Variant_seq=GGGC,CCCCG;ancestral_allele=C;Dbxref=dbSNP_154:rs140144319;evidence_values=Frequency,1000Genomes,ESP,Phenotype_or_Disease,ExAC,TOPMed,gnomAD;global_minor_allele_frequency=1|0.004193|211"
), new = list(c("Variant_seq=C", "Dbxref=rs1449034754", "Reference_seq=T"
), c("Variant_seq=C,G", "Dbxref=rs1587255763", "Reference_seq=A"
), c("Reference_seq=C", "Variant_seq=GGGC,CCCCG", "Dbxref=rs140144319"
))), row.names = c(NA, -3L), class = "data.frame")

This extracts all the subfields of V9 into separate columns without using regular expressions or packages. It converts V9 to dcf format using paste and chartr and then reads it in using read.dcf. Finally we append the created columns to DF.

m <- DF$V9 |>
  paste(collapse = "\n\n") |>
  chartr(old = "=;", new = ":\n") |>
  textConnection() |>
  read.dcf()

DF2 <- cbind(DF, m)

> str(DF2)
'data.frame':   3 obs. of  14 variables:
 $ V1                           : int  9 9 9
 $ V2                           : chr  "dbSNP" "dbSNP" "dbSNP"
 $ V3                           : chr  "SNV" "SNV" "SNV"
 $ V4                           : int  10007 10009 14824990
 $ V5                           : int  10007 10009 14824990
 $ V9                           : chr  "ID=1;Variant_seq=C;Dbxref=dbSNP_154:rs1449034754;evidence_values=Frequency,TOPMed;Reference_seq=T" "ID=2;Variant_seq=C,G;Dbxref=dbSNP_154:rs1587255763;evidence_values=Frequency;Reference_seq=A" "ID=30545117;Reference_seq=C;clinical_significance=benign;Variant_seq=GGGC,CCCCG;ancestral_allele=C;Dbxref=dbSNP"| __truncated__
 $ ID                           : chr  "1" "2" "30545117"
 $ Variant_seq                  : chr  "C" "C,G" "GGGC,CCCCG"
 $ Dbxref                       : chr  "dbSNP_154:rs1449034754" "dbSNP_154:rs1587255763" "dbSNP_154:rs140144319"
 $ evidence_values              : chr  "Frequency,TOPMed" "Frequency" "Frequency,1000Genomes,ESP,Phenotype_or_Disease,ExAC,TOPMed,gnomAD"
 $ Reference_seq                : chr  "T" "A" "C"
 $ clinical_significance        : chr  NA NA "benign"
 $ ancestral_allele             : chr  NA NA "C"
 $ global_minor_allele_frequency: chr  NA NA "1|0.004193|211"

Alternately write it like this:

cbind(
  DF, 
  read.dcf(textConnection(chartr("=;", ":\n", paste(DF$V9, collapse = "\n\n"))))
)

Note

The input DF in reproducible form.

DF <- structure(list(V1 = c(9L, 9L, 9L), V2 = c("dbSNP", "dbSNP", "dbSNP"
), V3 = c("SNV", "SNV", "SNV"), V4 = c(10007L, 10009L, 14824990L
), V5 = c(10007L, 10009L, 14824990L), V9 = c("ID=1;Variant_seq=C;Dbxref=dbSNP_154:rs1449034754;evidence_values=Frequency,TOPMed;Reference_seq=T", 
"ID=2;Variant_seq=C,G;Dbxref=dbSNP_154:rs1587255763;evidence_values=Frequency;Reference_seq=A", 
"ID=30545117;Reference_seq=C;clinical_significance=benign;Variant_seq=GGGC,CCCCG;ancestral_allele=C;Dbxref=dbSNP_154:rs140144319;evidence_values=Frequency,1000Genomes,ESP,Phenotype_or_Disease,ExAC,TOPMed,gnomAD;global_minor_allele_frequency=1|0.004193|211"
)), class = "data.frame", row.names = c(NA, -3L))

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM