簡體   English   中英

數據框 R - 查找字符串的一部分並返回某些兩個值

[英]Data frame R - lookup for the part of the string and return certain two values

我有一個包含 xml 導出內容的數據框,我想查找特定的字符串並從中返回兩個值。 以下是生成我的示例數據的代碼。

# Sample data: eight XML change-log exports, one document per element.
# Each document is a <businessObjectChanges> record for table XX_header with a
# single UPDATE <row>; every changed field is a <column> carrying <name>,
# <newValue> and <oldValue> (plus <mimeType> on non-key columns).
# Only elements 4, 5 and 7 contain a column named XX_status.
a <- c(
  '<?xml version="1.0" encoding="UTF-8" standalone="yes"?><businessObjectChanges version="1"><table><datetime>1500977276442</datetime><name>XX_header</name><row><datetime>1500977276442</datetime><transactionType>UPDATE</transactionType><column><name>est_groundtime</name><newValue>420</newValue><oldValue>480</oldValue><mimeType>TIME</mimeType></column><column><name>start_time</name><newValue>540</newValue><oldValue>480</oldValue><mimeType>TIME</mimeType></column><column><name>XXno_i</name><primaryKey>true</primaryKey><newValue>11</newValue><oldValue>11</oldValue></column></row></table></businessObjectChanges>',
   '<?xml version="1.0" encoding="UTF-8" standalone="yes"?><businessObjectChanges version="1"><table><datetime>1500977363880</datetime><name>XX_header</name><row><datetime>1500977363880</datetime><transactionType>UPDATE</transactionType><column><name>end_time</name><newValue>922</newValue><oldValue>960</oldValue><mimeType>TIME</mimeType></column><column><name>XXno_i</name><primaryKey>true</primaryKey><newValue>11</newValue><oldValue>11</oldValue></column></row></table></businessObjectChanges>',
    '<?xml version="1.0" encoding="UTF-8" standalone="yes"?><businessObjectChanges version="1"><table><datetime>1500977598476</datetime><name>XX_header</name><row><datetime>1500977598476</datetime><transactionType>UPDATE</transactionType><column><name>act_start_date</name><newValue>16642</newValue><oldValue>null</oldValue><mimeType>DATE</mimeType></column><column><name>act_start_time</name><newValue>607</newValue><oldValue>0</oldValue><mimeType>TIME</mimeType></column><column><name>act_end_date</name><newValue>16642</newValue><oldValue>null</oldValue><mimeType>DATE</mimeType></column><column><name>act_end_time</name><newValue>667</newValue><oldValue>0</oldValue><mimeType>TIME</mimeType></column><column><name>delay</name><newValue>7</newValue><oldValue>0</oldValue><mimeType>TIME</mimeType></column><column><name>XXno_i</name><primaryKey>true</primaryKey><newValue>205</newValue><oldValue>205</oldValue></column></row></table></businessObjectChanges>',
    '<?xml version="1.0" encoding="UTF-8" standalone="yes"?><businessObjectChanges version="1"><table><datetime>1500977613945</datetime><name>XX_header</name><row><datetime>1500977613945</datetime><transactionType>UPDATE</transactionType><column><name>XX_status</name><newValue>-2</newValue><oldValue>18</oldValue><mimeType>INT</mimeType></column><column><name>XXno_i</name><primaryKey>true</primaryKey><newValue>205</newValue><oldValue>205</oldValue></column></row></table></businessObjectChanges>',
    '<?xml version="1.0" encoding="UTF-8" standalone="yes"?><businessObjectChanges version="1"><table><datetime>1500977892448</datetime><name>XX_header</name><row><datetime>1500977892448</datetime><transactionType>UPDATE</transactionType><column><name>XX_status</name><newValue>19</newValue><oldValue>-2</oldValue><mimeType>INT</mimeType></column><column><name>XXno_i</name><primaryKey>true</primaryKey><newValue>29</newValue><oldValue>29</oldValue></column></row></table></businessObjectChanges>',
    '<?xml version="1.0" encoding="UTF-8" standalone="yes"?><businessObjectChanges version="1"><table><datetime>1500977738390</datetime><name>XX_header</name><row><datetime>1500977738390</datetime><transactionType>UPDATE</transactionType><column><name>act_start_date</name><newValue>16641</newValue><oldValue>null</oldValue><mimeType>DATE</mimeType></column><column><name>act_start_time</name><newValue>367</newValue><oldValue>0</oldValue><mimeType>TIME</mimeType></column><column><name>act_end_date</name><newValue>16641</newValue><oldValue>null</oldValue><mimeType>DATE</mimeType></column><column><name>act_end_time</name><newValue>667</newValue><oldValue>0</oldValue><mimeType>TIME</mimeType></column><column><name>XXno_i</name><primaryKey>true</primaryKey><newValue>44</newValue><oldValue>44</oldValue></column></row></table></businessObjectChanges>',
    '<?xml version="1.0" encoding="UTF-8" standalone="yes"?><businessObjectChanges version="1"><table><datetime>1500977757374</datetime><name>XX_header</name><row><datetime>1500977757374</datetime><transactionType>UPDATE</transactionType><column><name>XX_status</name><newValue>19</newValue><oldValue>18</oldValue><mimeType>INT</mimeType></column><column><name>XXno_i</name><primaryKey>true</primaryKey><newValue>44</newValue><oldValue>44</oldValue></column></row></table></businessObjectChanges>',
    '<?xml version="1.0" encoding="UTF-8" standalone="yes"?><businessObjectChanges version="1"><table><datetime>1500977836229</datetime><name>XX_header</name><row><datetime>1500977836229</datetime><transactionType>UPDATE</transactionType><column><name>act_start_date</name><newValue>16640</newValue><oldValue>null</oldValue><mimeType>DATE</mimeType></column><column><name>act_start_time</name><newValue>96</newValue><oldValue>0</oldValue><mimeType>TIME</mimeType></column><column><name>act_end_date</name><newValue>16640</newValue><oldValue>null</oldValue><mimeType>DATE</mimeType></column><column><name>act_end_time</name><newValue>156</newValue><oldValue>0</oldValue><mimeType>TIME</mimeType></column><column><name>XXno_i</name><primaryKey>true</primaryKey><newValue>203</newValue><oldValue>203</oldValue></column></row></table></businessObjectChanges>'

)

# Wrap the vector in a one-column data frame. The column takes its name from
# the variable, so the XML strings are afterwards reachable as a$a.
a <- as.data.frame(a)

我的數據框 'a' 需要的是兩列。 第一列是緊跟在 XX_status 之後的 newValue,第二列是同樣緊跟在 XX_status 之後的 oldValue。

因此,第 1 行到第 3 行不包含 XX_status,所以它們沒有對應的值;第 4 行包含 XX_status,提取結果將是 -2 和 18,對於第 5 行,結果將是 19 和 -2,依此類推...

任何想法將不勝感激

這是處理這種情況的一種方法。 我嘗試首先使用stri_extract_all_regex()提取長字符串的特定部分。 然後,我進一步嘗試在第二個stri_extract_all_regex()提取數字。 然後我使用unnest_wider()創建了兩列,並在最後為數據框分配了列名。 我希望這能幫到您。

library(tidyverse)
library(stringi)

# Pull the new and old value of the XX_status column out of every XML string
# in column `a`, returning one row per input with two character columns.
# Rows whose XML has no XX_status column yield NA in both columns.
a %>%
  transmute(
    res = a %>%
      # Step 1: isolate the XX_status fragment. The *first* variant returns a
      # plain character vector (NA when absent), so the next stringi call gets
      # an atomic input instead of relying on implicit list coercion.
      stri_extract_first_regex(
        pattern = "XX_status</name><newValue>-?\\d+</newValue><oldValue>-?\\d+</oldValue>"
      ) %>%
      # Step 2: grab every (optionally negative) integer that sits directly
      # between a ">" and a "<", i.e. the newValue and then the oldValue.
      stri_extract_all_regex(pattern = "(?<=>)-?\\d+(?=<)")
  ) %>%
  # The extracted vectors are unnamed, so `names_sep` is supplied to let tidyr
  # generate column names (res_1, res_2); they are renamed right after.
  unnest_wider(res, names_sep = "_") %>%
  setNames(nm = c("new_value", "old_value"))

#  new_value old_value
#  <chr>     <chr>    
#1 NA        NA       
#2 NA        NA       
#3 NA        NA       
#4 -2        18       
#5 19        -2       
#6 NA        NA       
#7 19        18       
#8 NA        NA  

我們可以使用stringr包和 REGEX 來提取這些值,如下所示。

library(dplyr)
library(stringr)

a <- c(...) # your XML string here

a <- as.data.frame(a)

# Add numeric newValue / oldValue columns taken from the XX_status entry of
# each XML string in column `a`; rows without XX_status get NA in both.
# Note: each `%>%` must end a line. A pipe at the start of a line is a parse
# error in R because the previous line is already a complete expression.
a <- a %>%
  mutate(
    # Whole XX_status entry, from the column name through the <mimeType> tag
    # that follows </oldValue>. The pattern is kept on a single line because a
    # line break inside the string literal would become part of the regex.
    status = str_extract(
      a,
      "XX_status[\\<\\/.\\>\\w\\d\\-]+\\<\\/oldValue\\>\\<mimeType\\>"
    ),
    # Two-step extraction: first the tag plus number, then the number alone.
    newValue = str_extract(status, "newValue\\>[\\-]*\\d+"),
    newValue = as.numeric(str_extract(newValue, "[\\-]*\\d+")),
    oldValue = str_extract(status, "oldValue\\>[\\-]*\\d+"),
    oldValue = as.numeric(str_extract(oldValue, "[\\-]*\\d+"))
  ) %>%
  # `status` was only an intermediate helper column.
  select(-status)

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM