简体   繁体   中英

R complex xml to data frame

I'm looking for a way to convert a highly complex XML file (it is too long, so it is at the bottom) to a table. The file was obtained from the official Property Registry and stores about 20,000 buildings.

The result must be a row for each "consulta_dnp" (each building), with those data in columns:

<pc1><pc2><car><cc1><cc2><np><nm><luso><sfc><cpt><ant>

Another problem is the errors that appear when data can't be retrieved. They are stored this way:

<consulta_dnp>
  <control>
    <cuerr>1</cuerr>
  </control>
  <lerr>
    <err>
      <cod>4</cod>
      <des>error description</des>
    </err>
  </lerr>
</consulta_dnp>

I'm not interested in error codes, I just want a blank line, "error" or something else.

I've been working with answers to similar questions, but I've had no luck.

That's the code I've been working with:

# Attempt to build one data-frame row per <consulta_dnp> record: parse the
# file, count the records, extract each field of record i into its own small
# data frame tagged with a constant `key`, merge those on `key`, and finally
# stack the per-record results.
doc <- xmlParse("resultado_JA-.txt")

# number of <consulta_dnp> nodes anywhere in the document
xml_len <- length(getNodeSet(doc,"//consulta_dnp"))

dflist <- lapply(seq(xml_len), function(i){   
  # PARENT NODES
  # NOTE(review): this XPath ends in "/" with no step after it; it is the
  # expression ("//consulta_dnp[1]/") named in the reported XPath error.
  d1 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/"))), key=1)
  # CHILD NODES
  # NOTE(review): these paths go through "ibdi", while the sample XML names
  # the element "idbi" -- confirm which spelling the real file uses.
  d2 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/pc1"))), key=1) 
  d3 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/pc2"))), key=1) 
  # NOTE(review): d4 repeats the pc1 query already used for d2.
  d4 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/pc1"))), key=1) 
  d5 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/car"))), key=1) 
  d6 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/cc1"))), key=1) 
  d7 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/cc2"))), key=1) 
  d8 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/dt/np"))), key=1) 
  d9 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/dt/nm"))), key=1) 
  d10 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ldt"))), key=1) 
  d11 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/debi/luso"))), key=1) 
  d12 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/debi/sfc"))), key=1) 
  d13 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/debi/cpt"))), key=1) 
  d14 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/debi/ant"))), key=1) 

  # MERGE ON KEY, THEN DROP KEY
  # NOTE(review): base merge() joins two data frames (x and y); fourteen
  # positional data frames are passed here -- verify this call does what is
  # intended once the XPath error is resolved.
  merge(d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, by="key")[-1]    
})

# stack the per-record data frames into a single data frame
xmldf_JA <- do.call(rbind, dflist)

This code counts the right number of occurrences of "consulta_dnp" but always gets stuck on this:

  aXPath error : Invalid expression
XPath error : Invalid expression
 Error in xpathApply.XMLInternalDocument(doc, path, fun, ..., namespaces = namespaces,  : 
  error evaluating xpath expression //consulta_dnp[1]/ 

Any help would be appreciated.

Here's the XML (not real data, but it has the real structure):

<Doc>
 <consulta_dnp>
  <control>
    <cudnp>1</cudnp>
    <cucons>1</cucons>
    <cucul>0</cucul>
  </control>
  <bico>
    <bi>
      <idbi>
        <cn>UR</cn>
        <rc>
          <pc1>0499418</pc1>
          <pc2>VG3709N</pc2>
          <car>0008</car>
          <cc1>R</cc1>
          <cc2>E</cc2>
        </rc>
      </idbi>
      <dt>
        <loine>
          <cp>23</cp>
          <cm>50</cm>
        </loine>
        <cmc>900</cmc>
        <np>VILLACONEJOS DE ARRIBA</np>
        <nm>MALAGA</nm>
        <locs>
          <lous>
            <lourb>
              <dir>
                <cv>799</cv>
                <tv>CL</tv>
                <nv>calle</nv>
                <pnp>2</pnp>
                <snp>0</snp>
              </dir>
              <loint>
                <es>1</es>
                <pt>01</pt>
                <pu>B</pu>
              </loint>
              <dp>29005</dp>
              <dm>1</dm>
            </lourb>
          </lous>
        </locs>
      </dt>
      <ldt>CL calle 2 Es:1 Pl:01 Pt:B 29005 Madrid (Madrid)</ldt>
      <debi>
        <luso>Residencial</luso>
        <sfc>72</sfc>
        <cpt>3,430000</cpt>
        <ant>1979</ant>
      </debi>
    </bi>
    <lcons>
      <cons>
        <lcd>VIVIENDA</lcd>
        <dt>
          <lourb>
            <loint>
              <es>1</es>
              <pt>01</pt>
              <pu>B</pu>
            </loint>
          </lourb>
        </dt>
        <dfcons>
          <stl>72</stl>
        </dfcons>
      </cons>
    </lcons>
  </bico>
</consulta_dnp>
</Doc>
library(xml2)
library(tidyverse)

I would try the following approach: read the data in with xml2, create the expressions for extracting the elements of interest, then map over those and combine the results into a data frame.

# the structure of the document (code for data see below)
# I copied the code, so we have one entry, one error, and the first entry repeated
xml
#> {xml_document}
#> <Doc>
#> [1] <consulta_dnp>\n  <control>\n    <cudnp>1</cudnp>\n    <cucons>1</cu ...
#> [2] <consulta_dnp>\n  <control>\n    <cuerr>1</cuerr>\n  </control>\n  < ...
#> [3] <consulta_dnp>\n  <control>\n    <cudnp>1</cudnp>\n    <cucons>1</cu ...

# helper: text content of every node below `x` that matches `xpath`
extract_child <- function(x, xpath) {
  matched_nodes <- xml_find_all(x, xpath)
  xml_text(matched_nodes)
}

# tag names of the fields that become columns
xpath_expressions <- c(
  "pc1", "pc2", "car", "cc1", "cc2", "np", "nm", "luso", "sfc", "cpt", "ant"
)

# name each field by its tag, look the tag up among all descendants of the
# document node, then bind the per-field vectors into columns
xpath_expressions %>%
  set_names() %>%
  map(~ extract_child(xml, paste0(".//", .x))) %>%
  dplyr::bind_rows() %>%
  type_convert(locale = locale(decimal_mark = ","))
#> # A tibble: 2 x 11
#>   pc1     pc2     car   cc1   cc2   np       nm    luso    sfc   cpt   ant
#>   <chr>   <chr>   <chr> <chr> <chr> <chr>    <chr> <chr> <int> <dbl> <int>
#> 1 0499418 VG3709N 0008  R     E     VILLACO… MALA… Resi…    72  3.43  1979
#> 2 0499418 VG3709N 0008  R     E     VILLACO… MALA… Resi…    72  3.43  1979

This approach "magically" works and the errors are no problem, since only those parts get extracted which we are interested in and there is no overlap between cases with errors and cases without. If you had entries where some fields are missing but others are present, you would need to adapt the code. To elaborate: when a whole tag is missing, this approach breaks. When all tags are present but one has no content (e.g. <ant></ant>), this results in a proper NA.

Update

The following code works even when there are missing elements and should work for your data.

# Text content of every node below `x` that matches `xpath`; when nothing
# matches, a single NA_character_ is returned instead of an empty vector.
extract_child <- function(x, xpath) {
  texts <- xml_text(xml_find_all(x, xpath))

  if (is_empty(texts)) {
    return(NA_character_)
  }

  texts
}

# our fields of interest: tag names, each looked up anywhere below a record
xpath_expressions <- c("pc1", "pc2", "car", "cc1", "cc2", "np", "nm", "luso", "sfc",
                       "cpt", "ant")

#' Extract the fields of interest from a single record.
#'
#' @param part An xml2 node holding one record (one child of the document
#'   root).
#' @return A tibble of character columns, one column per field that has at
#'   least one non-missing value in `part`. Fields that are entirely missing
#'   are dropped, so a record containing none of the fields (such as an error
#'   record) contributes no columns and no rows.
extract_part <- function(part) {
  xpath_expressions %>%
    set_names() %>%
    # ".//tag" searches the descendants of this record only
    map(~ extract_child(part, paste0(".//", .x))) %>%
    keep(~ any(!is.na(.x))) %>%
    dplyr::bind_rows()
}

# Bind all records while every column is still character, then guess the
# column types once on the combined table. Guessing per record can give the
# same column different types in different records (e.g. "0008" stays
# character while "1200" becomes a number), and such rows cannot be combined.
xml %>%
  xml_children() %>%
  map(extract_part) %>%
  dplyr::bind_rows() %>%
  type_convert(locale = locale(decimal_mark = ","))

Data

   xml <- read_xml("<Doc>
     <consulta_dnp>
    <control>
    <cudnp>1</cudnp>
    <cucons>1</cucons>
    <cucul>0</cucul>
    </control>
    <bico>
    <bi>
    <idbi>
    <cn>UR</cn>
    <rc>
    <pc1>0499418</pc1>
    <pc2>VG3709N</pc2>
    <car>0008</car>
    <cc1>R</cc1>
    <cc2>E</cc2>
    </rc>
    </idbi>
    <dt>
    <loine>
    <cp>23</cp>
    <cm>50</cm>
    </loine>
    <cmc>900</cmc>
    <np>VILLACONEJOS DE ARRIBA</np>
    <nm>MALAGA</nm>
    <locs>
    <lous>
    <lourb>
    <dir>
    <cv>799</cv>
    <tv>CL</tv>
    <nv>calle</nv>
    <pnp>2</pnp>
    <snp>0</snp>
    </dir>
    <loint>
    <es>1</es>
    <pt>01</pt>
    <pu>B</pu>
    </loint>
    <dp>29005</dp>
    <dm>1</dm>
    </lourb>
    </lous>
    </locs>
    </dt>
    <ldt>CL calle 2 Es:1 Pl:01 Pt:B 29005 Madrid (Madrid)</ldt>
    <debi>
    <luso>Residencial</luso>
    <sfc>72</sfc>
    <cpt>3,430000</cpt>
    <ant>1979</ant>
    </debi>
    </bi>
    <lcons>
    <cons>
    <lcd>VIVIENDA</lcd>
    <dt>
    <lourb>
    <loint>
    <es>1</es>
    <pt>01</pt>
    <pu>B</pu>
    </loint>
    </lourb>
    </dt>
    <dfcons>
    <stl>72</stl>
    </dfcons>
    </cons>
    </lcons>
    </bico>
    </consulta_dnp>
    <consulta_dnp>
      <control>
                    <cuerr>1</cuerr>
                    </control>
                    <lerr>
                    <err>
                    <cod>4</cod>
                    <des>error description</des>
                    </err>
                    </lerr>
                    </consulta_dnp>
     <consulta_dnp>
    <control>
                    <cudnp>1</cudnp>
                    <cucons>1</cucons>
                    <cucul>0</cucul>
                    </control>
                    <bico>
                    <bi>
                    <idbi>
                    <cn>UR</cn>
                    <rc>
                    <pc1>0499418</pc1>
                    <pc2>VG3709N</pc2>
                    <car>0008</car>
                    <cc1>R</cc1>
                    <cc2>E</cc2>
                    </rc>
                    </idbi>
                    <dt>
                    <loine>
                    <cp>23</cp>
                    <cm>50</cm>
                    </loine>
                    <cmc>900</cmc>
                    <np>VILLACONEJOS DE ARRIBA</np>
                    <nm>MALAGA</nm>
                    <locs>
                    <lous>
                    <lourb>
                    <dir>
                    <cv>799</cv>
                    <tv>CL</tv>
                    <nv>calle</nv>
                    <pnp>2</pnp>
                    <snp>0</snp>
                    </dir>
                    <loint>
                    <es>1</es>
                    <pt>01</pt>
                    <pu>B</pu>
                    </loint>
                    <dp>29005</dp>
                    <dm>1</dm>
                    </lourb>
                    </lous>
                    </locs>
                    </dt>
                    <ldt>CL calle 2 Es:1 Pl:01 Pt:B 29005 Madrid (Madrid)</ldt>
                    <debi>
                    <luso>Residencial</luso>
                    <sfc>72</sfc>
                    <cpt>3,430000</cpt>
                    <ant>1979</ant>
                    </debi>
                    </bi>
                    <lcons>
                    <cons>
                    <lcd>VIVIENDA</lcd>
                    <dt>
                    <lourb>
                    <loint>
                    <es>1</es>
                    <pt>01</pt>
                    <pu>B</pu>
                    </loint>
                    </lourb>
                    </dt>
                    <dfcons>
                    <stl>72</stl>
                    </dfcons>
                    </cons>
                    </lcons>
                    </bico>
                    </consulta_dnp>
    </Doc>")

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM