[英]Data frame R - lookup for the part of the string and return certain two values
我有一個包含 XML 導出內容的數據框,我想在其中查找特定(certain)字符串,並從中返回兩個值。 以下是我的數據示例的代碼。
# Sample data: eight XML change-log exports, one string per record. Each
# string holds one <row> with several <column> entries; a <column> carries a
# <name>, a <newValue> and an <oldValue>. Only the 4th, 5th and 7th strings
# contain a column named XX_status.
a <- c(
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><businessObjectChanges version="1"><table><datetime>1500977276442</datetime><name>XX_header</name><row><datetime>1500977276442</datetime><transactionType>UPDATE</transactionType><column><name>est_groundtime</name><newValue>420</newValue><oldValue>480</oldValue><mimeType>TIME</mimeType></column><column><name>start_time</name><newValue>540</newValue><oldValue>480</oldValue><mimeType>TIME</mimeType></column><column><name>XXno_i</name><primaryKey>true</primaryKey><newValue>11</newValue><oldValue>11</oldValue></column></row></table></businessObjectChanges>',
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><businessObjectChanges version="1"><table><datetime>1500977363880</datetime><name>XX_header</name><row><datetime>1500977363880</datetime><transactionType>UPDATE</transactionType><column><name>end_time</name><newValue>922</newValue><oldValue>960</oldValue><mimeType>TIME</mimeType></column><column><name>XXno_i</name><primaryKey>true</primaryKey><newValue>11</newValue><oldValue>11</oldValue></column></row></table></businessObjectChanges>',
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><businessObjectChanges version="1"><table><datetime>1500977598476</datetime><name>XX_header</name><row><datetime>1500977598476</datetime><transactionType>UPDATE</transactionType><column><name>act_start_date</name><newValue>16642</newValue><oldValue>null</oldValue><mimeType>DATE</mimeType></column><column><name>act_start_time</name><newValue>607</newValue><oldValue>0</oldValue><mimeType>TIME</mimeType></column><column><name>act_end_date</name><newValue>16642</newValue><oldValue>null</oldValue><mimeType>DATE</mimeType></column><column><name>act_end_time</name><newValue>667</newValue><oldValue>0</oldValue><mimeType>TIME</mimeType></column><column><name>delay</name><newValue>7</newValue><oldValue>0</oldValue><mimeType>TIME</mimeType></column><column><name>XXno_i</name><primaryKey>true</primaryKey><newValue>205</newValue><oldValue>205</oldValue></column></row></table></businessObjectChanges>',
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><businessObjectChanges version="1"><table><datetime>1500977613945</datetime><name>XX_header</name><row><datetime>1500977613945</datetime><transactionType>UPDATE</transactionType><column><name>XX_status</name><newValue>-2</newValue><oldValue>18</oldValue><mimeType>INT</mimeType></column><column><name>XXno_i</name><primaryKey>true</primaryKey><newValue>205</newValue><oldValue>205</oldValue></column></row></table></businessObjectChanges>',
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><businessObjectChanges version="1"><table><datetime>1500977892448</datetime><name>XX_header</name><row><datetime>1500977892448</datetime><transactionType>UPDATE</transactionType><column><name>XX_status</name><newValue>19</newValue><oldValue>-2</oldValue><mimeType>INT</mimeType></column><column><name>XXno_i</name><primaryKey>true</primaryKey><newValue>29</newValue><oldValue>29</oldValue></column></row></table></businessObjectChanges>',
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><businessObjectChanges version="1"><table><datetime>1500977738390</datetime><name>XX_header</name><row><datetime>1500977738390</datetime><transactionType>UPDATE</transactionType><column><name>act_start_date</name><newValue>16641</newValue><oldValue>null</oldValue><mimeType>DATE</mimeType></column><column><name>act_start_time</name><newValue>367</newValue><oldValue>0</oldValue><mimeType>TIME</mimeType></column><column><name>act_end_date</name><newValue>16641</newValue><oldValue>null</oldValue><mimeType>DATE</mimeType></column><column><name>act_end_time</name><newValue>667</newValue><oldValue>0</oldValue><mimeType>TIME</mimeType></column><column><name>XXno_i</name><primaryKey>true</primaryKey><newValue>44</newValue><oldValue>44</oldValue></column></row></table></businessObjectChanges>',
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><businessObjectChanges version="1"><table><datetime>1500977757374</datetime><name>XX_header</name><row><datetime>1500977757374</datetime><transactionType>UPDATE</transactionType><column><name>XX_status</name><newValue>19</newValue><oldValue>18</oldValue><mimeType>INT</mimeType></column><column><name>XXno_i</name><primaryKey>true</primaryKey><newValue>44</newValue><oldValue>44</oldValue></column></row></table></businessObjectChanges>',
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><businessObjectChanges version="1"><table><datetime>1500977836229</datetime><name>XX_header</name><row><datetime>1500977836229</datetime><transactionType>UPDATE</transactionType><column><name>act_start_date</name><newValue>16640</newValue><oldValue>null</oldValue><mimeType>DATE</mimeType></column><column><name>act_start_time</name><newValue>96</newValue><oldValue>0</oldValue><mimeType>TIME</mimeType></column><column><name>act_end_date</name><newValue>16640</newValue><oldValue>null</oldValue><mimeType>DATE</mimeType></column><column><name>act_end_time</name><newValue>156</newValue><oldValue>0</oldValue><mimeType>TIME</mimeType></column><column><name>XXno_i</name><primaryKey>true</primaryKey><newValue>203</newValue><oldValue>203</oldValue></column></row></table></businessObjectChanges>'
)
# Wrap the character vector in a one-column data frame. The column takes the
# name of the argument, so it is also called `a`.
# NOTE(review): on R < 4.0 this column becomes a factor unless
# stringsAsFactors = FALSE is set -- confirm the R version in use.
a <- as.data.frame(a)
我的數據框 'a' 需要的是兩列。 第一列是緊跟在 XX_status 之後的 <newValue> 值,第二列是同樣緊跟在 XX_status 之後的 <oldValue> 值。
因此,第 1 行到第 3 行不包含 XX_status,所以它們不會有任何結果;第 4 行包含 XX_status,提取結果將是 -2 和 18;對於第 5 行,結果將是 19 和 -2,依此類推...
任何想法將不勝感激
這是處理這種情況的一種方法。 我首先使用 stri_extract_all_regex() 提取長字符串中包含 XX_status 的特定部分。 然后,我再用第二個 stri_extract_all_regex() 從該部分中提取數字。 接著我使用 unnest_wider() 創建了兩列,並在最后為數據框分配了列名。 我希望這能幫到您。
library(tidyverse)
library(stringi)
# Extract the new and old values of the XX_status change from every XML
# string in column `a` of data frame `a`. The result has one row per input
# row and two character columns, new_value and old_value; rows whose XML has
# no XX_status column get NA in both.
transmute(
  a,
  res = stri_extract_all_regex(
    str = a,
    # First pass: isolate the XX_status fragment with its two values.
    pattern = "XX_status</name><newValue>-?\\d+</newValue><oldValue>-?\\d+</oldValue>"
  ) %>%
    # Second pass: keep only the (optionally negative) integers that sit
    # directly between a closing `>` and an opening `<`.
    stri_extract_all_regex(pattern = "(?<=>)-?\\d+(?=<)")
) %>%
  # The vectors in `res` are unnamed, so `names_sep` is supplied to let
  # unnest_wider() generate column names (res_1, res_2); recent tidyr
  # versions raise an error for unnamed elements without it.
  unnest_wider(res, names_sep = "_") %>%
  setNames(nm = c("new_value", "old_value"))
# new_value old_value
# <chr> <chr>
#1 NA NA
#2 NA NA
#3 NA NA
#4 -2 18
#5 19 -2
#6 NA NA
#7 19 18
#8 NA NA
我們可以使用 stringr 包和正則表達式(regex)來提取這些值,如下所示。
library(dplyr)
library(stringr)
a <- c(...) # your XML string here
a <- as.data.frame(a)
# Add numeric newValue / oldValue columns holding the XX_status change found
# in each XML string of column `a`; rows without an XX_status change get NA.
# Each pipe operator must end its line: a line that starts with %>% is a
# parse error because the previous line is already a complete expression.
a <- a %>%
  mutate(
    # Fragment from "XX_status" through the "</oldValue><mimeType>" that
    # follows it. The pattern is assembled with paste0() only to keep lines
    # short; it must stay one regex with no line break inside it.
    status = str_extract(
      a,
      paste0(
        "XX_status[\\<\\/.\\>\\w\\d\\-]+",
        "\\<\\/oldValue\\>\\<mimeType\\>"
      )
    ),
    # Two steps per value: grab "newValue>NN", then strip it to the signed
    # integer and convert to numeric.
    newValue = str_extract(status, "newValue\\>[\\-]*\\d+"),
    newValue = as.numeric(str_extract(newValue, "[\\-]*\\d+")),
    oldValue = str_extract(status, "oldValue\\>[\\-]*\\d+"),
    oldValue = as.numeric(str_extract(oldValue, "[\\-]*\\d+"))
  ) %>%
  # Drop the intermediate helper column.
  select(-status)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.