I'm looking for a way to convert a highly complex XML file (it is too long, so it's at the bottom) to a table. The file was obtained from the official Property Registry and stores about 20,000 buildings.
The result must be one row for each "consulta_dnp" (each building), with these fields as columns:
<pc1><pc2><car><cc1><cc2><np><nm><luso><sfc><cpt><ant>
Another problem is the error entries returned when data can't be retrieved. They are stored like this:
<consulta_dnp>
<control>
<cuerr>1</cuerr>
</control>
<lerr>
<err>
<cod>4</cod>
<des>error description</des>
</err>
</lerr>
</consulta_dnp>
I'm not interested in error codes, I just want a blank line, "error" or something else.
I've been working with answers to similar questions, but I haven't had any luck.
This is the code I've been working with:
# Attempt to build one data frame row per <consulta_dnp> record: parse the
# file, count the records, extract each field of record i into its own
# one-column data frame, merge those on a constant key, and row-bind the
# per-record results.
# (xmlParse / getNodeSet / xmlToDataFrame are presumably from the XML package;
# the library() call is not shown here.)
doc <- xmlParse("resultado_JA-.txt")
# Number of <consulta_dnp> nodes anywhere in the document.
xml_len <- length(getNodeSet(doc,"//consulta_dnp"))
dflist <- lapply(seq(xml_len), function(i){
# PARENT NODES
# NOTE(review): this XPath ends with a trailing "/", and it is exactly the
# expression quoted as invalid in the error message reported below.
d1 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/"))), key=1)
# CHILD NODES
# Each statement selects one field of record i and tags it with key = 1 so the
# pieces can be joined afterwards.
# NOTE(review): the paths below use "ibdi", while the sample document nests
# <rc> under <idbi> - verify the element name.
d2 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/pc1"))), key=1)
d3 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/pc2"))), key=1)
# NOTE(review): d4 repeats the pc1 path already used for d2.
d4 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/pc1"))), key=1)
d5 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/car"))), key=1)
d6 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/cc1"))), key=1)
d7 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/cc2"))), key=1)
d8 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/dt/np"))), key=1)
d9 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/dt/nm"))), key=1)
d10 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ldt"))), key=1)
d11 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/debi/luso"))), key=1)
d12 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/debi/sfc"))), key=1)
d13 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/debi/cpt"))), key=1)
d14 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/debi/ant"))), key=1)
# MERGE ON KEY, THEN DROP KEY
# NOTE(review): base merge() joins two data frames per call; confirm how the
# fourteen inputs passed here are meant to be combined.
merge(d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, by="key")[-1]
})
# Stack the per-record data frames into a single table.
xmldf_JA <- do.call(rbind, dflist)
This code counts the right number of occurrences of "consulta_dnp", but it always gets stuck on this:
XPath error : Invalid expression
XPath error : Invalid expression
Error in xpathApply.XMLInternalDocument(doc, path, fun, ..., namespaces = namespaces, :
error evaluating xpath expression //consulta_dnp[1]/
Any help would be appreciated.
Here's the XML (not real data, but it has the real structure):
<Doc>
<consulta_dnp>
<control>
<cudnp>1</cudnp>
<cucons>1</cucons>
<cucul>0</cucul>
</control>
<bico>
<bi>
<idbi>
<cn>UR</cn>
<rc>
<pc1>0499418</pc1>
<pc2>VG3709N</pc2>
<car>0008</car>
<cc1>R</cc1>
<cc2>E</cc2>
</rc>
</idbi>
<dt>
<loine>
<cp>23</cp>
<cm>50</cm>
</loine>
<cmc>900</cmc>
<np>VILLACONEJOS DE ARRIBA</np>
<nm>MALAGA</nm>
<locs>
<lous>
<lourb>
<dir>
<cv>799</cv>
<tv>CL</tv>
<nv>calle</nv>
<pnp>2</pnp>
<snp>0</snp>
</dir>
<loint>
<es>1</es>
<pt>01</pt>
<pu>B</pu>
</loint>
<dp>29005</dp>
<dm>1</dm>
</lourb>
</lous>
</locs>
</dt>
<ldt>CL calle 2 Es:1 Pl:01 Pt:B 29005 Madrid (Madrid)</ldt>
<debi>
<luso>Residencial</luso>
<sfc>72</sfc>
<cpt>3,430000</cpt>
<ant>1979</ant>
</debi>
</bi>
<lcons>
<cons>
<lcd>VIVIENDA</lcd>
<dt>
<lourb>
<loint>
<es>1</es>
<pt>01</pt>
<pu>B</pu>
</loint>
</lourb>
</dt>
<dfcons>
<stl>72</stl>
</dfcons>
</cons>
</lcons>
</bico>
</consulta_dnp>
</Doc>
library(xml2)
library(tidyverse)
I would try the following approach: read the data in with xml2, create the expressions for extracting the elements of interest, then map over those and combine the results into a data.frame.
# Print the parsed document to show its structure (the code that builds `xml`
# is given further below).
# The sample holds three <consulta_dnp> records: one valid entry, one error
# entry, and the valid entry repeated.
xml
#> {xml_document}
#> <Doc>
#> [1] <consulta_dnp>\n <control>\n <cudnp>1</cudnp>\n <cucons>1</cu ...
#> [2] <consulta_dnp>\n <control>\n <cuerr>1</cuerr>\n </control>\n < ...
#> [3] <consulta_dnp>\n <control>\n <cudnp>1</cudnp>\n <cucons>1</cu ...
# Helper: return the text of every node below `x` that matches `xpath`.
#   x     - an xml2 document or node to search in
#   xpath - XPath expression selecting the nodes of interest
# The result is a character vector with one element per matching node.
extract_child <- function(x, xpath) {
  matched_nodes <- xml_find_all(x, xpath)
  xml_text(matched_nodes)
}
# Element names to extract; they also become the output column names.
xpath_expressions <- c(
  "pc1", "pc2", "car", "cc1", "cc2", "np", "nm",
  "luso", "sfc", "cpt", "ant"
)

# Prefix every name with ".//" so it is matched at any depth below the
# document node, collect the matching text for each field, name the pieces
# after their fields, bind them as columns, and let readr guess column types
# (a comma is the decimal mark in this data, e.g. "3,430000").
map(
  paste0(".//", xpath_expressions),
  function(path) extract_child(xml, path)
) %>%
  set_names(xpath_expressions) %>%
  dplyr::bind_rows() %>%
  type_convert(locale = locale(decimal_mark = ","))
#> # A tibble: 2 x 11
#> pc1 pc2 car cc1 cc2 np nm luso sfc cpt ant
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <int> <dbl> <int>
#> 1 0499418 VG3709N 0008 R E VILLACO… MALA… Resi… 72 3.43 1979
#> 2 0499418 VG3709N 0008 R E VILLACO… MALA… Resi… 72 3.43 1979
This approach "magically" works, and the errors are no problem, because only the parts we are interested in get extracted and there is no overlap between cases with errors and cases without. If you had entries where some fields are missing but others are present, you would need to adapt the code. To elaborate: when a whole tag is missing, this approach breaks. When all tags are present but one has no content (e.g. <ant></ant>), this results in a proper NA.
The following code works even when there are missing elements and should work for your data.
# Return the text of every node below `x` that matches `xpath`.
# When nothing matches, a single NA_character_ is returned instead of an
# empty vector, so a missing tag still yields one (missing) value.
extract_child <- function(x, xpath) {
  texts <- xml_text(xml_find_all(x, xpath))
  if (length(texts) == 0) {
    return(NA_character_)
  }
  texts
}
# our fields of interest
xpath_expressions <- c(
  "pc1", "pc2", "car", "cc1", "cc2", "np", "nm",
  "luso", "sfc", "cpt", "ant"
)

# Extract the fields of interest from a single record node.
#   part - one child node of the document (one <consulta_dnp> record)
# Returns a tibble of character columns, one per field that was found.
# Fields with no value are dropped here and filled with NA when the records
# are bound together; a record with no fields at all (an error entry)
# contributes no row.
# Type conversion is deliberately NOT done here - see the pipeline below.
extract_part <- function(part) {
  xpath_expressions %>%
    paste0(".//", .) %>% # search anywhere below this record node
    map(~extract_child(part, .x)) %>%
    set_names(xpath_expressions) %>%
    keep(~any(!is.na(.))) %>%
    dplyr::bind_rows()
}

# Bind all records first, then guess column types once over the whole table.
# Converting record by record lets readr guess from a single value, so the
# same column can come out as character in one record (e.g. "0499418") and
# numeric in another (a value without a leading zero), and columns of
# different types cannot be row-bound. Converting once also avoids repeating
# the type guessing for each of the many thousands of records.
xml %>%
  xml_children() %>%
  map_df(extract_part) %>%
  type_convert(locale = locale(decimal_mark = ","))
# Sample data: parse an inline XML string into `xml`.
# The <Doc> root holds three <consulta_dnp> records: a valid entry, an error
# entry (<cuerr> / <lerr>), and the valid entry repeated.
xml <- read_xml("<Doc>
<consulta_dnp>
<control>
<cudnp>1</cudnp>
<cucons>1</cucons>
<cucul>0</cucul>
</control>
<bico>
<bi>
<idbi>
<cn>UR</cn>
<rc>
<pc1>0499418</pc1>
<pc2>VG3709N</pc2>
<car>0008</car>
<cc1>R</cc1>
<cc2>E</cc2>
</rc>
</idbi>
<dt>
<loine>
<cp>23</cp>
<cm>50</cm>
</loine>
<cmc>900</cmc>
<np>VILLACONEJOS DE ARRIBA</np>
<nm>MALAGA</nm>
<locs>
<lous>
<lourb>
<dir>
<cv>799</cv>
<tv>CL</tv>
<nv>calle</nv>
<pnp>2</pnp>
<snp>0</snp>
</dir>
<loint>
<es>1</es>
<pt>01</pt>
<pu>B</pu>
</loint>
<dp>29005</dp>
<dm>1</dm>
</lourb>
</lous>
</locs>
</dt>
<ldt>CL calle 2 Es:1 Pl:01 Pt:B 29005 Madrid (Madrid)</ldt>
<debi>
<luso>Residencial</luso>
<sfc>72</sfc>
<cpt>3,430000</cpt>
<ant>1979</ant>
</debi>
</bi>
<lcons>
<cons>
<lcd>VIVIENDA</lcd>
<dt>
<lourb>
<loint>
<es>1</es>
<pt>01</pt>
<pu>B</pu>
</loint>
</lourb>
</dt>
<dfcons>
<stl>72</stl>
</dfcons>
</cons>
</lcons>
</bico>
</consulta_dnp>
<consulta_dnp>
<control>
<cuerr>1</cuerr>
</control>
<lerr>
<err>
<cod>4</cod>
<des>error description</des>
</err>
</lerr>
</consulta_dnp>
<consulta_dnp>
<control>
<cudnp>1</cudnp>
<cucons>1</cucons>
<cucul>0</cucul>
</control>
<bico>
<bi>
<idbi>
<cn>UR</cn>
<rc>
<pc1>0499418</pc1>
<pc2>VG3709N</pc2>
<car>0008</car>
<cc1>R</cc1>
<cc2>E</cc2>
</rc>
</idbi>
<dt>
<loine>
<cp>23</cp>
<cm>50</cm>
</loine>
<cmc>900</cmc>
<np>VILLACONEJOS DE ARRIBA</np>
<nm>MALAGA</nm>
<locs>
<lous>
<lourb>
<dir>
<cv>799</cv>
<tv>CL</tv>
<nv>calle</nv>
<pnp>2</pnp>
<snp>0</snp>
</dir>
<loint>
<es>1</es>
<pt>01</pt>
<pu>B</pu>
</loint>
<dp>29005</dp>
<dm>1</dm>
</lourb>
</lous>
</locs>
</dt>
<ldt>CL calle 2 Es:1 Pl:01 Pt:B 29005 Madrid (Madrid)</ldt>
<debi>
<luso>Residencial</luso>
<sfc>72</sfc>
<cpt>3,430000</cpt>
<ant>1979</ant>
</debi>
</bi>
<lcons>
<cons>
<lcd>VIVIENDA</lcd>
<dt>
<lourb>
<loint>
<es>1</es>
<pt>01</pt>
<pu>B</pu>
</loint>
</lourb>
</dt>
<dfcons>
<stl>72</stl>
</dfcons>
</cons>
</lcons>
</bico>
</consulta_dnp>
</Doc>")
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.