簡體   English   中英

R 正則表達式提取 substring 后跟行尾或特定字符(惰性匹配)

[英]R regex extract substring followed by end of line or specific character (lazy match)

我有一個向量 mystr,其元素包含給定參數的度量單位——由 UOM= 后面的字母、符號等表示。它可能位於字符串的末尾,也可能以分號 ; 與后面的內容分隔。

c("\\\\Server-01?6cf038ea-d583-4860-9488-67ee59c767c2\\expnum.2PDT35103?6438;TimeMethod=AtOrBefore;UOM=inHg;pointtype=Float32;displaydigits=1", 
"\\\\Server02-01?6cf038ea-d583-4860-9488-67ee59c767c2\\testnum.2BTAVGBARPR.OUT?6449;TimeMethod=AtOrBefore;UOM=inHg", 
"\\\\Server02-01?6cf038ea-d583-4860-9488-67ee59c767c2\\testnum3.2PT39248S.XQ01?6453;TimeMethod=AtOrBefore;UOM=psia;pointtype=Float32;displaydigits=1")

在上面的例子中,我想分別提取 inHg、inHg 和 psia。到目前為止,我已經嘗試使用 regmatches 和 regexec,但沒有找到適用於所有三個示例的任何內容:

# Extract the first capture group of `ptrn` from a single string.
#
# string: character scalar to search.
# ptrn:   regular expression containing at least one capture group.
#
# Returns the text of the first capture group, or NA when `ptrn` does not
# match `string`.
regex_func <- function(string, ptrn) {
  hit <- regexec(pattern = ptrn, text = string)
  # Element 1 of the result is the whole match; element 2 is the first group.
  pieces <- regmatches(x = string, m = hit)[[1]]
  pieces[2]
}

> sapply(mystr, function(z){ regex_func(string = z, ptrn = '.*UOM=(.*)[$;]?')}, USE.NAMES = F)
[1] "inHg;pointtype=Float32;displaydigits=1" "inHg"                                  
[3] "psia;pointtype=Float32;displaydigits=1"

> sapply(mystr, function(z){ regex_func(string = z, ptrn = '.*UOM=(.*)[$;]+?')}, USE.NAMES = F)
[1] "inHg" NA     "psia"

> sapply(mystr, function(z){ regex_func(string = z, ptrn = '.*UOM=(.*)[$;]')}, USE.NAMES = F)
[1] "inHg;pointtype=Float32" NA                       "psia;pointtype=Float32"

> sapply(mystr, function(z){ regex_func(string = z, ptrn = '.*UOM=(.*)[$;]{0,1}')}, USE.NAMES = F)
[1] "inHg;pointtype=Float32;displaydigits=1" "inHg"                                  
[3] "psia;pointtype=Float32;displaydigits=1"

我不依賴於 regmatches,也願意使用其他函數/包,例如 stringr 或 stringi。

編輯

添加了包含所有信息的示例數據幀——並非所有 ConfigString 元素都有 UOM。

structure(list(Name = c("Ambient Pressure", "Ambient RH", "Ambient Temperature", 
"Average Exhaust Gas Temp", "Bellmouth Temperature", "Compressor Discharge Pressure", 
"Compressor Discharge Temperature", "Current Power Output", "Degradation in Heat Rate (Comp Effic)", 
"Degradation in Power Output (Comp Effic)", "DirtPenetratingToEngineSinceLastWash", 
"Fuel Gas Temperature", "Fuel Heating Value (by volume)", "Fuel Volumetric Flow", 
"GT Fired Hours", "HRSG HP Steam Outlet Mass Flow", "HRSG HP Steam Outlet Pressure", 
"HRSG HP Steam Outlet Temperature", "HRSG IP Steam Outlet Mass Flow", 
"HRSG IP Steam Outlet Pressure", "HRSG IP Steam Outlet Temperature", 
"HRSG LP Steam Outlet Mass Flow", "HRSG LP Steam Outlet Pressure", 
"HRSG LP Steam Outlet Temperature", "Inlet Guide Vane Position", 
"Inlet system pressure drop", "Steam Injection Flow", "Steam Injection Pressure", 
"Steam Injection Temp"), DefaultUnitsName = c("kilopascal", "percent", 
"degree Celsius", "degree Celsius", "degree Celsius", "kilopascal", 
"degree Celsius", "megawatt", "kilojoule per kilowatt-hour", 
"megawatt", "gram", "degree Celsius", "BTU per standard cubic foot", 
"standard cubic foot per second", "hour", "kilogram per second", 
"kilopascal", "degree Celsius", "kilogram per second", "kilopascal", 
"degree Celsius", "kilogram per second", "kilopascal", "degree Celsius", 
"degree", "kilopascal", "kilogram per second", "pound-force per square inch", 
"degree Celsius"), DefaultUnitsNameAbbreviation = c("kPa", "%", 
"°C", "°C", "°C", "kPa", "°C", "MW", "kJ/kWh", "MW", "g", "°C", 
"BTU/scf", "scfs", "h", "kg/s", "kPa", "°C", "kg/s", "kPa", "°C", 
"kg/s", "kPa", "°C", "°", "kPa", "kg/s", "psi", "°C"), ConfigString = c("\\\\#\\asset1.2BTAVGBARPR.OUT?6449;TimeMethod=AtOrBefore;UOM=inHg", 
"\\\\#\\asset1.2BTAVGHUM.OUT?6423;TimeMethod=AtOrBefore", "\\\\#\\asset1.2BTAVGAMBTEMP.OUT?6446;TimeMethod=AtOrBefore;UOM=°F", 
"\\\\#\\asset1.2TEAVTX.ZQ01?6456;TimeMethod=AtOrBefore;UOM=°F", 
"\\\\#\\asset1.BT0110.CTGgtAIte01a?6802;TimeMethod=AtOrBefore;UOM=°F;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2PT39248S.XQ01?6453;TimeMethod=AtOrBefore;UOM=psia;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2TE35401S.XQ02?6457;TimeMethod=AtOrBefore;UOM=°F;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2JT38601S.XQ01?6450;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1", 
"\\\\#\\Degradation in Heat Rate (Comp Effic)?6275;TimeMethod=AtOrBefore", 
"\\\\#\\Degradation in Power Output (Comp Effic)?6274;TimeMethod=AtOrBefore", 
"\\\\#\\Dust?6273;TimeMethod=AtOrBefore", 
"\\\\#\\asset1.2TE36112.XQ01?6454;TimeMethod=AtOrBefore;UOM=°F;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2FC54SUM.XQ01?6448;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.BT0110.CTGgtFGvl01a?6801;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2CTGFiredHours?6800;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2FT5050S.XQ01?6455;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2PT5000S.XQ01?6799;TimeMethod=AtOrBefore;UOM=psi;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2TE5020S.XQ01?6798;TimeMethod=AtOrBefore;UOM=°F;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2FT5150S.XQ01?6447;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2PT5100S.XQ01?6797;UOM=psi;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2TE5120S.XQ01?6796;TimeMethod=AtOrBefore;UOM=°F;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2FT5250S.XQ01?6443;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2PT5200S.XQ01?6795;TimeMethod=AtOrBefore;UOM=psi;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2TE5220S.XQ01?6794;TimeMethod=AtOrBefore;UOM=°F;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2ZT35203.XQ01?6432;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2PDT35103?6438;TimeMethod=AtOrBefore;UOM=inHg;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2FT36602X.ZQ01?6792;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2PT245.XQ01?6793;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1", 
"\\\\#\\asset1.2TE240.XQ01?6791;TimeMethod=AtOrBefore;UOM=°F;pointtype=Float32;displaydigits=1"
)), row.names = c(NA, -29L), class = c("data.table", "data.frame"
))

# 15 items returned 
> regmatches(x = dat$ConfigString, regexpr(pattern = '[?;]UOM=\\K[^;]+', text = dat$ConfigString, perl = T))
 [1] "inHg" "°F"   "°F"   "°F"   "psia" "°F"   "°F"   "psi"  "°F"   "psi"  "°F"   "psi"  "°F"   "inHg" "°F"

將所選解決方案與此數據一起使用:

# operation on a vector
> dat[, uom := regmatches(ConfigString, regexpr(pattern = '[?;]UOM=\\K[^;]+', ConfigString, perl = T))]

# using := operator in data.table
> dat[, uom := regmatches(ConfigString, regexpr(pattern = '[?;]UOM=\\K[^;]+', ConfigString, perl = T))]
Error in `[.data.table`(dat, , `:=`(uom, regmatches(ConfigString, regexpr(pattern = "[?;]UOM=\\K[^;]+",  : 
  Supplied 15 items to be assigned to 29 items of column 'uom'. If you wish to 'recycle' the RHS please use rep() to make this intent clear to readers of your code.

使用stringr

> stringr::str_replace(dat$ConfigString, ".*[?;]UOM=([^;]+).*", "\\1")
 [1] "inHg"                                                                                          
 [2] "\\\\#\\asset1.2BTAVGHUM.OUT?6423;TimeMethod=AtOrBefore"                                        
 [3] "°F"                                                                                            
 [4] "°F"                                                                                            
 [5] "°F"                                                                                            
 [6] "psia"                                                                                          
 [7] "°F"                                                                                            
 [8] "\\\\#\\asset1.2JT38601S.XQ01?6450;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1"     
 [9] "\\\\#\\Degradation in Heat Rate (Comp Effic)?6275;TimeMethod=AtOrBefore"                       
[10] "\\\\#\\Degradation in Power Output (Comp Effic)?6274;TimeMethod=AtOrBefore"                    
[11] "\\\\#\\Dust?6273;TimeMethod=AtOrBefore"                                                        
[12] "°F"                                                                                            
[13] "\\\\#\\asset1.2FC54SUM.XQ01?6448;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1"      
[14] "\\\\#\\asset1.BT0110.CTGgtFGvl01a?6801;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1"
[15] "\\\\#\\asset1.2CTGFiredHours?6800;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1"     
[16] "\\\\#\\asset1.2FT5050S.XQ01?6455;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1"      
[17] "psi"                                                                                           
[18] "°F"                                                                                            
[19] "\\\\#\\asset1.2FT5150S.XQ01?6447;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1"      
[20] "psi"                                                                                           
[21] "°F"                                                                                            
[22] "\\\\#\\asset1.2FT5250S.XQ01?6443;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1"      
[23] "psi"                                                                                           
[24] "°F"                                                                                            
[25] "\\\\#\\asset1.2ZT35203.XQ01?6432;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1"      
[26] "inHg"                                                                                          
[27] "\\\\#\\asset1.2FT36602X.ZQ01?6792;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1"     
[28] "\\\\#\\asset1.2PT245.XQ01?6793;TimeMethod=AtOrBefore;pointtype=Float32;displaydigits=1"        
[29] "°F"  

您可以使用以下基本 R PCRE 正則表達式解決方案:

[?;]UOM=\K[^;]+

或者,像這樣的stringr解決方案

library(stringr)
str_match(x, "[?;]UOM=([^;]+)")[,2]

請參閱正則表達式演示。詳情:

  • [?;] - 一個 ? 或 ; 字符
  • UOM= - 字面的 UOM= 子字符串
  • \K - 匹配重置運算符(丟棄此前已匹配的文本)
  • [^;]+ - 一個或多個除 ; 以外的字符。

請參閱R 演示

# Sample input: four config strings, each carrying a ";UOM=<unit>" field that
# is either followed by more ";"-separated fields or sits at the very end.
x <- c("\\\\Server-01?6cf038ea-d583-4860-9488-67ee59c767c2\\expnum.2PDT35103?6438;TimeMethod=AtOrBefore;UOM=inHg;pointtype=Float32;displaydigits=1", 
"\\\\Server02-01?6cf038ea-d583-4860-9488-67ee59c767c2\\testnum.2BTAVGBARPR.OUT?6449;TimeMethod=AtOrBefore;UOM=inHg", 
"\\\\Server02-01?6cf038ea-d583-4860-9488-67ee59c767c2\\testnum3.2PT39248S.XQ01?6453;TimeMethod=AtOrBefore;UOM=psia;pointtype=Float32;displaydigits=1",
"\\\\Server02-01?6cf038ea-d583-4860-9488-67ee59c767c2\\testnum.2TE36112.XQ01?6454;TimeMethod=AtOrBefore;UOM=°F;pointtype=Float32;displaydigits=1")
# Base R (PCRE): \K discards the "[?;]UOM=" prefix from the reported match, so
# only the run of non-";" characters after it is returned.
# NOTE: regmatches() with regexpr() yields matched elements only; an input
# without "UOM=" contributes nothing, so the result can be shorter than x.
regmatches(x, regexpr("[?;]UOM=\\K[^;]+", x, perl=TRUE))
## => [1] "inHg" "inHg" "psia" "°F"  
# stringr alternative: str_match() returns a matrix whose column 2 is the
# first capture group, i.e. the unit.
library(stringr)
str_match(x, "[?;]UOM=([^;]+)")[,2]
## => [1] "inHg" "inHg" "psia" "°F"  

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM