I have strings like the following:
# Sample expressions: signed terms, each an optional integer coefficient
# followed by `*` and a variable name.
b <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2"
b1 <- "+sh_8_6_1_1_1-1000*sdp_8_6_1_1_1"
I am trying to split it into the following:
b: Variables Coefficient
sl_1__1_1_1 1
from_st_1_4_1_1_1 5
into_st_4_1_1_1_1 -70
sl_1__1_1_2 -1
b1: Variables Coefficient
sh_8_6_1_1_1 1
sdp_8_6_1_1_1 -1000
The strsplit call I am using at the moment is not able to pick up coefficients with more than one digit (i.e. the -1000 coefficient).
Any help would be much appreciated.
Here is one approach with strsplit:
b <- "+fg_1+5*ug1_1-7*tg_4" # original example string

# Splitting on the variable names (and the `*` in front of them, if present)
# leaves only the signed coefficients; a lone "+" means an implicit 1.
Coefficient <- strsplit(b, "\\**[a-z0-9]+_\\d", perl = TRUE)[[1]]
Coefficient <- as.numeric(sub("\\+$", "1", Coefficient))

# Splitting on "sign, one digit, optional *" leaves only the variable names;
# the first name keeps its leading "+", which is then removed.
Variable <- strsplit(b, "[+-]\\d\\**", perl = TRUE)[[1]]
Variable <- sub("\\+", "", Variable)

data.frame(Variable, Coefficient)
# Variable Coefficient
#1 fg_1 1
#2 ug1_1 5
#3 tg_4 -7
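Explanation of \\**[a-z0-9]+_\\d
:
\\** : a literal * (written \\* inside the R string), followed by * meaning repeated 0 or more times. Could easily use ? for optional instead: \\*?
[a-z0-9]+ : a lowercase letter or a digit, with + meaning one or more times
_ : a literal underscore
\\d : a single digit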
# new sample strings
b <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2"
#b <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2+50"
#b <- "+sh_8_6_1_1_1-1000*sdp_8_6_1_1_1"
#b <- "+fg_1+5*ug1_1-7*tg_4" # 1st sample string from original question

# Splitting on "sign, optional digits, optional *" leaves the variable names.
# The leading sign yields an empty first piece, which is dropped.
Variable <- strsplit(b, "[+-][0-9]*\\**", perl = TRUE)[[1]]
Variable <- Variable[Variable != ""]

# Splitting on the variable names (including a preceding `*`, if any) leaves
# the signed coefficients; a bare sign is completed to +1 / -1.
Coefficient <- strsplit(
  b,
  "(?<=[+-])\\D+.*?(?=[+-]|$)|(?<=\\d)\\*.*?(?=[+-]|$)",
  perl = TRUE
)[[1]]
Coefficient <- as.numeric(sub("([+-]$)", "\\11", Coefficient))

# handle possible constant at end of string: it shows up as one coefficient
# more than there are variable names. The test is a single TRUE/FALSE, so a
# plain if/else is used rather than the vectorised ifelse().
if (length(Coefficient) == length(Variable) + 1L) {
  df <- data.frame(Variable = c(Variable, "constant"), Coefficient)
} else {
  df <- data.frame(Variable, Coefficient)
}
df
# Variable Coefficient
#1 sl_1__1_1_1 1
#2 from_st_1_4_1_1_1 5
#3 into_st_4_1_1_1_1 -70
#4 sl_1__1_1_2 -1
Here's another
# Sample inputs: `b` and `b1` are the two expressions from the question;
# `b2` is `b` with a trailing constant term appended.
b <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2"
b1 <- "+sh_8_6_1_1_1-1000*sdp_8_6_1_1_1"
b2 <- paste0(b, "+50")
First step, to make life easier, is to add in 1s when no coefficient is given, and then use the regex I mentioned in the comments.
# Insert an explicit 1 wherever a sign is directly followed by a non-digit,
# i.e. wherever the coefficient was left implicit. The input is `b` (the
# expression without a trailing constant), so the tokens below come in
# coefficient/variable pairs. The result is stored under a new name so that
# `b` itself is not overwritten.
(b_explicit <- gsub('([+-])(\\D)', '\\11+\\2', b))
# [1] "+1+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-1+sl_1__1_1_2"
# Extract every optionally signed run of word characters; `*` is not a word
# character, so it separates a coefficient from its variable.
(bb <- regmatches(b_explicit, gregexpr('[+-]?\\w+', text = b_explicit))[[1]])
# [1] "+1" "+sl_1__1_1_1" "+5"
# [4] "from_st_1_4_1_1_1" "-70" "into_st_4_1_1_1_1"
# [7] "-1" "+sl_1__1_1_2"
then do some final re-arranging and formatting
# Lay the tokens out two per row (coefficient, variable) and swap the columns
# so that the variable comes first.
bb <- data.frame(matrix(bb, ncol = 2, byrow = TRUE)[, 2:1])
bb
# X1 X2
# 1 +sl_1__1_1_1 +1
# 2 from_st_1_4_1_1_1 +5
# 3 into_st_4_1_1_1_1 -70
# 4 +sl_1__1_1_2 -1
# Strip the leftover sign from the names and convert the coefficients to
# numbers. The work happens on a local copy, so `bb` itself is left unchanged
# and only the cleaned result is printed.
local({
  bb$X1 <- gsub('\\-|\\+', '', as.character(bb$X1))
  bb$X2 <- as.numeric(as.character(bb$X2))
  bb
})
# X1 X2
# 1 sl_1__1_1_1 1
# 2 from_st_1_4_1_1_1 5
# 3 into_st_4_1_1_1_1 -70
# 4 sl_1__1_1_2 -1
This would be easier to use if you put it into a function. I also added a check for trailing constant terms
# Split a linear-expression string into its variables and coefficients.
#
# x: a string of signed terms such as "+a_1+5*b_2-70*c_3-d_4+50" (only the
#    first element of `x` is used). A term without an explicit coefficient
#    counts as +1 / -1, and a bare signed integer at the very end of the
#    string is reported under the name "constant".
#
# Returns a data.frame with a character column `Variable` and a numeric
# column `Coefficient`, one row per term.
f <- function(x) {
  ## check constant: tag a trailing bare number so it pairs up like any term
  x <- gsub('([+-]\\d+)$', '\\1*constant', x)
  ## make implicit coefficients explicit: "+name" becomes "+1+name"
  x <- gsub('([+-])(\\D)', '\\11+\\2', x)
  ## tokens now alternate: coefficient, variable, coefficient, variable, ...
  x <- regmatches(x, gregexpr('[+-]?\\w+', text = x))[[1]]
  ## one row per (coefficient, variable) pair with the columns swapped so the
  ## variable comes first; drop = FALSE keeps a single-term input as a 1 x 2
  ## matrix instead of collapsing it to a plain vector
  x <- data.frame(matrix(x, ncol = 2, byrow = TRUE)[, 2:1, drop = FALSE],
                  stringsAsFactors = FALSE)
  x[, 2] <- as.numeric(x[, 2])
  ## the variable tokens of implicit-coefficient terms still carry a sign
  x[, 1] <- gsub('\\-|\\+', '' , x[, 1])
  setNames(x, c('Variable','Coefficient'))
}
# Example calls; each commented table is the data frame printed by the call
# above it.
# NOTE(review): the first table corresponds to the expression without a
# trailing constant -- confirm `b` still holds that original string (and has
# not been overwritten by an earlier step) before running this.
f(b)
# Variable Coefficient
# 1 sl_1__1_1_1 1
# 2 from_st_1_4_1_1_1 5
# 3 into_st_4_1_1_1_1 -70
# 4 sl_1__1_1_2 -1
# Two terms: one with an implicit coefficient, one with a multi-digit one.
f(b1)
# Variable Coefficient
# 1 sh_8_6_1_1_1 1
# 2 sdp_8_6_1_1_1 -1000
# Same four terms as the first example plus a trailing number, which is
# listed under the name "constant".
f(b2)
# Variable Coefficient
# 1 sl_1__1_1_1 1
# 2 from_st_1_4_1_1_1 5
# 3 into_st_4_1_1_1_1 -70
# 4 sl_1__1_1_2 -1
# 5 constant 50
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.