简体   繁体   中英

How to remove coefficients from an R formula string

I have strings like the following:

# Sample formula strings: signed terms, each an optional "<number>*" coefficient
# followed by a variable name.
b <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2"
b1 <- "+sh_8_6_1_1_1-1000*sdp_8_6_1_1_1"

I am trying to split it into the following:

b:  Variables             Coefficient   
     sl_1__1_1_1                 1
    from_st_1_4_1_1_1            5
     into_st_4_1_1_1_1          -70
     sl_1__1_1_2                -1
b1: Variables                   Coefficient       
    sh_8_6_1_1_1                 1
    sdp_8_6_1_1_1              -1000

The strsplit function I am using at the moment is not able to pick up coefficients with more than one digit (ie the 1000 coefficient).

Any help would be much appreciated.

Here is one approach with strsplit:

# Original example string from the question.
b <- "+fg_1+5*ug1_1-7*tg_4"

# Splitting on the variable names (including an optional leading "*") leaves
# the signed coefficients behind. A lone "+" means no coefficient was written,
# so it is turned into "1" before the numeric conversion.
Coefficient <- strsplit(b, "\\**[a-z0-9]+_\\d", perl=TRUE)[[1]]
Coefficient <- sub("\\+$", "1", Coefficient)
Coefficient <- as.numeric(Coefficient)

# Splitting on "<sign><digit>" plus an optional "*" leaves the variable names;
# the first piece still carries its leading "+", which is stripped afterwards.
Variable <- strsplit(b, "[+-]\\d\\**", perl=TRUE)[[1]]
Variable <- sub("\\+", "", Variable)

data.frame(Variable, Coefficient)
#  Variable Coefficient
#1     fg_1           1
#2    ug1_1           5
#3     tg_4          -7

Explanation of \\**[a-z0-9]+_\\d :

  • Matches an optional asterisk: \\**
    • The trailing * means "repeated 0 or more times". You could just as easily use ? for optional instead: \\*?
  • Followed by any lowercase letter or any digit 0 to 9: [a-z0-9]
    • + one or more times: [a-z0-9]+
  • Followed by an underscore: _
  • Followed by a digit character: \\d

Edit: Update for new sample string with slightly different pattern and a possible constant at the end (eg +50 in the below example):

# new sample strings
b <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2"
#b <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2+50"
#b <- "+sh_8_6_1_1_1-1000*sdp_8_6_1_1_1"
#b <- "+fg_1+5*ug1_1-7*tg_4"              # 1st sample string from original question

# Variable names: split on every sign together with its optional numeric
# coefficient and "*". The leading sign produces an empty first piece (and a
# trailing constant is consumed entirely by the separator), so empty strings
# are dropped.
Variable <- strsplit(b, "[+-][0-9]*\\**", perl=TRUE)[[1]]
Variable <- Variable[nzchar(Variable)]

# Coefficients: split on the variable names so that only the signed numbers
# remain. A piece that is just a sign had no explicit coefficient, so "1" is
# appended to it before converting to numeric.
Coefficient <- as.numeric(sub("([+-]$)", "\\11",
                 strsplit(b, "(?<=[+-])\\D+.*?(?=[+-]|$)|(?<=\\d)\\*.*?(?=[+-]|$)",
                 perl=TRUE)[[1]]))

# handle possible constant at end of string:
# one more coefficient than variables means the last number stands alone.
# A plain if/else is used because the condition is a single TRUE/FALSE;
# ifelse() is meant for vectors and would also auto-print a stray list here.
if (length(Coefficient) == length(Variable) + 1L) {
  df <- data.frame(Variable = c(Variable, "constant"), Coefficient)
} else {
  df <- data.frame(Variable, Coefficient)
}
df
#           Variable Coefficient
#1       sl_1__1_1_1           1
#2 from_st_1_4_1_1_1           5
#3 into_st_4_1_1_1_1         -70
#4       sl_1__1_1_2          -1

Here's another

# Sample inputs: b and b1 are the two strings from the question; b2 is b with
# a trailing constant term (+50) appended.
b <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2"
b1 <- "+sh_8_6_1_1_1-1000*sdp_8_6_1_1_1"
b2 <- "+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-sl_1__1_1_2+50" 

First step, to make life easier, is to add in 1s when no coefficient is given, and then use the regex I mentioned in the comments.

# After every sign that is directly followed by a non-digit, insert an explicit
# coefficient of 1 plus a "+" separator, so that every term becomes a
# <coefficient>, <variable> pair of tokens.
# The input is b, not b2: b2's trailing "+50" has no variable to pair with and
# would leave an odd number of tokens for the two-column matrix() step below.
(b <- gsub('([+-])(\\D)', '\\11+\\2', b))
# [1] "+1+sl_1__1_1_1+5*from_st_1_4_1_1_1-70*into_st_4_1_1_1_1-1+sl_1__1_1_2"

# Tokenise b: each token is an optional sign followed by a run of word
# characters, so "*" (and any sign not followed by a word character) is
# skipped. b is a single string, hence the first list element is taken.
(bb <- regmatches(x = b, m = gregexpr(pattern = '[+-]?\\w+', text = b))[[1L]])
# [1] "+1"                "+sl_1__1_1_1"      "+5"               
# [4] "from_st_1_4_1_1_1" "-70"               "into_st_4_1_1_1_1"
# [7] "-1"                "+sl_1__1_1_2"  

then do some final re-arranging and formatting

# Lay the tokens out row by row as (coefficient, variable) pairs, then swap the
# two columns so the variable comes first.
(bb <- local({
  token_pairs <- matrix(bb, ncol = 2, byrow = TRUE)
  data.frame(token_pairs[, 2:1])
}))
#                  X1  X2
# 1      +sl_1__1_1_1  +1
# 2 from_st_1_4_1_1_1  +5
# 3 into_st_4_1_1_1_1 -70
# 4      +sl_1__1_1_2  -1

# Final formatting (the result is printed, bb itself is left unchanged):
# strip the signs from the variable names and convert the coefficient tokens
# to numbers.
local({
  cleaned <- bb
  cleaned$X1 <- gsub('\\-|\\+', '', as.character(bb$X1))
  cleaned$X2 <- as.numeric(as.character(bb$X2))
  cleaned
})

#                  X1  X2
# 1       sl_1__1_1_1   1
# 2 from_st_1_4_1_1_1   5
# 3 into_st_4_1_1_1_1 -70
# 4       sl_1__1_1_2  -1

This would be easier to use if you put it into a function. I also added a check for trailing constant terms

# Split a linear-expression string such as "+x_1+5*y_2-70*z_3+50" into its
# variable names and numeric coefficients.
#
# Args:
#   x: a single string made of terms joined by "+" or "-". Each term is either
#      "<variable>", "<number>*<variable>" or a bare "<number>". The sign on
#      the first term is optional and whitespace is ignored. Numbers may be
#      integers or plain decimals (no exponent notation, since "-" and "+"
#      always start a new term).
#
# Returns:
#   A data.frame with one row per term, in order of appearance, and columns
#   Variable (character) and Coefficient (numeric). A term without an explicit
#   number gets a coefficient of 1 or -1; a bare number is reported under the
#   variable name "constant".
f <- function(x) {
  x <- as.character(x)
  stopifnot(length(x) == 1L, !is.na(x))
  x <- gsub('\\s+', '', x)

  ## one token per term: an optional sign followed by everything up to the
  ## next sign (so an unsigned leading term is still picked up)
  terms <- regmatches(x, gregexpr('[+-]?[^+-]+', x))[[1]]

  sgn <- rep(1, length(terms))
  sgn[substr(terms, 1L, 1L) == '-'] <- -1
  body <- sub('^[+-]', '', terms)

  ## a term carries an explicit coefficient only when it starts with a number
  ## that is followed by "*" or by the end of the term (a bare constant);
  ## this keeps digits that belong to a variable name out of the coefficient
  has_coef <- grepl('^\\d+(\\.\\d+)?(\\*|$)', body)

  magnitude <- rep('1', length(terms))
  magnitude[has_coef] <- sub('\\*.*$', '', body[has_coef])

  variable <- body
  variable[has_coef] <- sub('^[^*]*\\*?', '', body[has_coef])
  ## check constant
  variable[!nzchar(variable)] <- 'constant'

  data.frame(Variable = variable,
             Coefficient = sgn * as.numeric(magnitude),
             stringsAsFactors = FALSE)
}

f(b)

#            Variable Coefficient
# 1       sl_1__1_1_1           1
# 2 from_st_1_4_1_1_1           5
# 3 into_st_4_1_1_1_1         -70
# 4       sl_1__1_1_2          -1

f(b1)

#        Variable Coefficient
# 1  sh_8_6_1_1_1           1
# 2 sdp_8_6_1_1_1       -1000

f(b2)

#            Variable Coefficient
# 1       sl_1__1_1_1           1
# 2 from_st_1_4_1_1_1           5
# 3 into_st_4_1_1_1_1         -70
# 4       sl_1__1_1_2          -1
# 5          constant          50

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM