简体   繁体   中英

Extract different words from a character string in R

I have seen several SO posts that seem to come close to answering this question but I cannot tell if any actually do so please forgive me is this is a duplicate post. I have several dozens of character strings (this a column within a data frame) that include different numbers, usually written out as words but sometimes as integers. Eg:

Three neonates with one adult

1 adult, ten neonates nearby

Two adults and six neonates

My ultimate goal is to be able to extract the number of neonates and adults from each string and get something like this:

data.frame(Adults=c(1,1,6), Neonates=c(3,10,6)

But the number and location of the number within the string varies. All of the examples I have seen using gsub , strsplit , etc. seem to only work when the pattern used to substitute, split, extract, etc. is the same across strings or stays in a constant position within the string. Since I know that the numbers must be c("one","two",...,"ten") , I could possibly loop through every character string and then loop through every possible number to see if it is present within the string and then, if present, extract it and convert to numeric. But this seems very inefficient.

Any help would be most appreciated!!

One potential approach using str_split from stringr package and a custom function to wrap finding matches and post processing. Dataset size has not been mentioned hence cannot test/comment on speed.

library(stringr) #for str_split

customFun = function(
strObj="Three neonates with one adult",
rootOne = "adult",
rootTwo = "neonate"){

#split string
discreteStr = str_split(strObj,pattern = "\\s+",simplify = TRUE)



#find indices of root words
rootOneIndex = grep(rootOne,discreteStr)
rootTwoIndex = grep(rootTwo,discreteStr)

#mapping vectors
charVec = c("one","two","three","four","five","six","seven","eight","nine","ten")
numVec = as.character(1:10)
names(numVec) = charVec

#match index neighbourhood ,-1/+1  and select first match
rootOneMatches = tolower(discreteStr[c(rootOneIndex-1,rootOneIndex+1)])
rootOneMatches = rootOneMatches[!is.na(rootOneMatches)]
rootOneMatches = head(rootOneMatches,1)


rootTwoMatches = tolower(discreteStr[c(rootTwoIndex-1,rootTwoIndex+1)])
rootTwoMatches = rootTwoMatches[!is.na(rootTwoMatches)]
rootTwoMatches = head(rootTwoMatches,1)

#check presence in mapping vectors
rootOneNum = intersect(rootOneMatches,c(charVec,numVec))
rootTwoNum = intersect(rootTwoMatches,c(charVec,numVec))

#final matches and numeric conversion
rootOneFinal = ifelse(!is.na(as.numeric(rootOneNum)),as.numeric(rootOneNum),as.numeric(numVec[rootOneNum]))
rootTwoFinal = ifelse(!is.na(as.numeric(rootTwoNum)),as.numeric(rootTwoNum),as.numeric(numVec[rootTwoNum]))

outDF = data.frame(strObj = strObj,adults = rootOneFinal,neonates = rootTwoFinal,stringsAsFactors=FALSE) 
return(outDF)
}

Output:

inputVec = c("Three neonates with one adult","1 adult, ten neonates nearby","Two adults and six neonates")
outputAggDF = suppressWarnings(do.call(rbind,lapply(inputVec,customFun)))

outputAggDF
#                         strObj adults neonates
#1 Three neonates with one adult      1        3
#2  1 adult, ten neonates nearby      1       10
#3   Two adults and six neonates      2        6

I was able to get the end result, but I'll admit my code isn't pretty.

string1 <- c("Three neonates with one adult")
string2 <- c("1 adult, ten neonates nearby")
string3 <- c("Two adults and six neonates")
df <- rbind(string1, string2, string3)

#change all written words to numeric values
df <- tolower(df)
df <- ifelse(grepl("one", df), gsub("one", 1, df), df)
df <- ifelse(grepl("two", df), gsub("two", 2, df), df)
df <- ifelse(grepl("three", df), gsub("three", 3, df), df)
df <- ifelse(grepl("four", df), gsub("four", 4, df), df)
df <- ifelse(grepl("five", df), gsub("five", 5, df), df)
df <- ifelse(grepl("six", df), gsub("six", 6, df), df)
df <- ifelse(grepl("seven", df), gsub("seven", 7, df), df)
df <- ifelse(grepl("eight", df), gsub("eight", 8, df), df)
df <- ifelse(grepl("nine", df), gsub("nine", 9, df), df)
df <- ifelse(grepl("ten", df), gsub("ten", 10, df), df)


#extract number and the next two spaces (gets a or n for adult or neonates)
number_let <- gregexpr('[0-9]+..',df)
list_nl <- regmatches(df,number_let)

df <- as.data.frame(df)
new_df <- data.frame(matrix(unlist(list_nl), nrow=nrow(df), byrow=T))
> new_df
   X1   X2
1 3 n  1 a
2 1 a 10 n
3 2 a  6 n

new_df$X1 <- as.character(new_df$X1)
new_df$X2 <- as.character(new_df$X2)

#extract numeric values
FW <- data.frame(matrix(unlist(regmatches(new_df$X1,gregexpr('[0-9]+',new_df$X1))), nrow=nrow(df), byrow=T))
SW <- data.frame(matrix(unlist(regmatches(new_df$X2,gregexpr('[0-9]+',new_df$X2))), nrow=nrow(df), byrow=T))

new_df <- cbind(new_df, FW, SW)
colnames(new_df)[3:4] <- c("FW", "SW")

new_df$FW <- as.numeric(as.character(new_df$FW))
new_df$SW <- as.numeric(as.character(new_df$SW))

#get numeric value separated into neonates and adults
new_df$neonate_1 <- ifelse(grepl("n", new_df$X1), new_df$FW,0)
new_df$neonate_2 <- ifelse(grepl("n", new_df$X2), new_df$SW,0)
new_df$adult_1 <-ifelse(grepl("a", new_df$X1), new_df$FW,0)
new_df$adult_2 <- ifelse(grepl("a", new_df$X2), new_df$SW,0)

#total neonates and adults for each string
new_df$total_neo <- new_df$neonate_1 + new_df$neonate_2
new_df$total_adu <- new_df$adult_1 + new_df$adult_2

#extract the two final columns
Count <- new_df[,9:10]
colnames(Count) <- c("Neonates", "Adults")

> Count
  Neonates Adults
1        3      1
2       10      1
3        6      2

Others were a little bit faster, but here's a slightly different approach if you're interested.

In my eyes, the main problem is the replacement of the "one" "two" etc. strings, which is pretty tedious to type out and impossible for high numbers.

strings <- c("Three neonates with one adult",
"1 adult, ten neonates nearby",
"Two adults and six neonates")

numbers <- c("one","two","three","four","five","six","seven","eight","nine","ten")

splitted <- unlist(strsplit(strings, split="[[:blank:] | [:punct:]]"))

ind_neon <- which((splitted == "neonates") | (splitted == "neonate"))
ind_adul <- which((splitted == "adults") | (splitted == "adult"))

neon <- tolower(splitted[ind_neon-1])
adul <- tolower(splitted[ind_adul-1])

neon2 <- as.numeric(neon)
neon2[is.na(neon2)] <- as.numeric(factor(neon[is.na(neon2)],
               levels=numbers,
               labels=(1:10)))

adul2 <- as.numeric(adul)
adul2[is.na(adul2)] <- as.numeric(factor(adul[is.na(adul2)],
                levels=numbers,
                labels=(1:10)))

adul2
# [1] 1 1 2
neon2
# [1]  3 10  6

There are surely more efficient options, but this does the trick, and could be adapted to use more numbers if you add them to the pattern vectors.

library(stringr)
library(qdap)
library(tidyr)

Bring in data

 v <- tolower(c("Three neonates with one adult",
           "1 adult, ten neonates nearby",
           "Two adults and six neonates"))

Assign word and number vectors for patterns

words<- c("one","two","three","four","five","six","seven","eight","nine","ten")
nums <- seq(1, 10)
pattern <- c(words, nums)

Extract and paste together all numbers and types

w <- paste(unlist(str_extract_all( v, paste(pattern, collapse="|"))),
           unlist(str_extract_all( v, "neonate|adult")))

Use mutliple gsub from qdap to replace all written numbers with their correponding integer

w <- mgsub(words, nums, w)
w <- do.call(rbind.data.frame, strsplit(w, " "))
names(w) <- c("numbers", "name")

Generate rowid so you can spread the data.

w$row <- rep(1:(nrow(w)/2), each=2)
spread(w, name, numbers)[-c(1)]


#    adult neonate
#  1     1       3
#  2     1      10
#  3     2       6

strapply in gsubfn package allows to extract words as shown below. I could not find any in-built function to convert words to digits or vice versa, but there might be pre-built functions created by other users.

> library(gsubfn)
> df <- data.frame(Text = c("Three neonates with one adult","1 adult, ten neonates nearby","Two adults and six neonates"))
> df
                           Text
1 Three neonates with one adult
2  1 adult, ten neonates nearby
3   Two adults and six neonates

> for(i in 1:nrow(df)){
+     
+     df$Adults[i] <- strapply(as.character(df$Text[i]), "(\\w+) adult*")
+     df$Neonates[i] <- strapply(as.character(df$Text[i]), "(\\w+) neonate*")
+     
+ }

> df
                           Text Adults Neonates
1 Three neonates with one adult    one    Three
2  1 adult, ten neonates nearby      1      ten
3   Two adults and six neonates    Two      six

This is a simple answer using only base R without any fancy package ;-)

If you only have between 1 and 10 neonates/adults, and if they always come in your string as X adult(s) and Y neonate(s) (ie, the number before the category), then it is pretty simple:

df = data.frame(strings = c("Three neonates with one adult",
                            "1 adult, ten neonates nearby",
                            "Two adults and six neonates"))

littnums = c('one', 'two', 'three', 'four', 'five', 
             'six', 'seven', 'eight', 'nine', 'ten')
nums = 1:10

getnums = function(mystring, mypattern) {
  # split your string at all spaces
  mysplitstring = unlist(strsplit(mystring, split=' '))
  # The number you are looking for is before the pattern
  numBeforePattern = mysplitstring[grep(mypattern, mysplitstring) - 1]
  # Then convert it to a integer or, if it fails, translate it 
  ifelse(is.na(suppressWarnings(as.integer(numBeforePattern))), 
         nums[grep(tolower(numBeforePattern), littnums)], 
         as.integer(numBeforePattern))
}

df$Neonates = sapply(as.vector(df$strings), FUN=getnums, 'neonate')
df$Adults = sapply(as.vector(df$strings), FUN=getnums, 'adult')
df
#                         strings Neonates Adults
# 1 Three neonates with one adult        3      1
# 2  1 adult, ten neonates nearby       10      1
# 3   Two adults and six neonates        6      2

Here's another approach

Your data

S <- c("Three neonates with one adult", "1 adult, ten neonates nearby", "Two adults and six neonates")

dplyr and stringr approach

library(stringr)
library(dplyr)

searchfor <- c("neonates", "adult")         
words <- str_extract_all(S, boundary("word"))   # keep only words

This next statement will grab the word before all searchfor words, and save as a data.frame

chrnum <- as.data.frame(Reduce(cbind, lapply(searchfor, function(y) lapply(words, function(x) x[which(x %in% y)-1]))))

This next statement will str_replace_all using a named vector and convert to numeric

replaced <- chrnum %>% 
              mutate_all(funs(as.numeric(str_replace_all(tolower(.), c("one" = "1", "two" = "2", "three" = "3", "four" = "4", "five" = "5", "six" = "6", "seven" = "7", "eight" = "8", "nine" = "9", "ten" = "10"))))) %>%
              setNames(searchfor)

NOTE you will receive a warning about coercing an NA value

Output

  neonates adult
1        3     1
2       10     1
3        6    NA

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM