简体   繁体   中英

How to read an XML file in R and convert it to a data.frame

library(XML)
# Parse the XML file into a document tree.
file <- "E:/aaa.xml"
doc <- xmlInternalTreeParse(file)

# The namespace prefix is taken from the root element instead of being
# hard-coded, then used to build the XPath that selects the patient nodes.
ns <- names(xmlNamespace(xmlRoot(doc)))
patient <- getNodeSet(doc, path = paste0("/", ns, ":tcga_bcr/", ns, ":patient"))

# Convert the patient nodes to a data frame. FALSE is spelled out because `F`
# is an ordinary variable that can be reassigned.
row <- xmlToDataFrame(nodes = patient, stringsAsFactors = FALSE)

The shared_stage:stage_event node has many nested child nodes. How can I extract each child node as its own column?

If a node has a preferred_name attribute, use its value as the data.frame column name.

aaa.xml:

<?xml version="1.0" encoding="UTF-8"?>
<brca:tcga_bcr xsi:schemaLocation="http://tcga.nci/bcr/xml/clinical/brca/2.7 http://tcga-data.nci.nih.gov/docs/xsd/BCR/tcga.nci/bcr/xml/clinical/brca/2.7/TCGA_BCR.BRCA_Clinical.xsd" schemaVersion="2.7" xmlns:brca="http://tcga.nci/bcr/xml/clinical/brca/2.7" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:admin="http://tcga.nci/bcr/xml/administration/2.7" xmlns:clin_shared="http://tcga.nci/bcr/xml/clinical/shared/2.7" xmlns:shared="http://tcga.nci/bcr/xml/shared/2.7" xmlns:brca_shared="http://tcga.nci/bcr/xml/clinical/brca/shared/2.7" xmlns:shared_stage="http://tcga.nci/bcr/xml/clinical/shared/stage/2.7" xmlns:brca_nte="http://tcga.nci/bcr/xml/clinical/brca/shared/new_tumor_event/2.7/1.0" xmlns:nte="http://tcga.nci/bcr/xml/clinical/shared/new_tumor_event/2.7" xmlns:follow_up_v2.1="http://tcga.nci/bcr/xml/clinical/brca/followup/2.7/2.1" xmlns:rx="http://tcga.nci/bcr/xml/clinical/pharmaceutical/2.7" xmlns:rad="http://tcga.nci/bcr/xml/clinical/radiation/2.7">
<brca:patient>
    <admin:additional_studies/>
    <clin_shared:tumor_tissue_site preferred_name="submitted_tumor_site" display_order="9999" cde="3427536" cde_ver="2.000" xsd_ver="2.6" tier="2" owner="TSS" procurement_status="Completed" restricted="false" source_system_identifier="175314">Breast</clin_shared:tumor_tissue_site>
    <clin_shared:race_list>
        <clin_shared:race preferred_name="race" display_order="12" cde="2192199" cde_ver="1.000" xsd_ver="1.8" tier="2" owner="TSS" procurement_status="Completed" restricted="false" source_system_identifier="175301">WHITE</clin_shared:race>
    </clin_shared:race_list>
    <shared:bcr_patient_barcode preferred_name="" display_order="9999" cde="2673794" cde_ver="" xsd_ver="1.8" owner="TSS" procurement_status="Completed" restricted="false">TCGA-A2-A0EV</shared:bcr_patient_barcode>
    <shared:tissue_source_site cde="" cde_ver="" xsd_ver="2.4" owner="TSS" procurement_status="Completed" restricted="false">A2</shared:tissue_source_site>
    <shared_stage:stage_event system="AJCC">
        <shared_stage:system_version preferred_name="ajcc_staging_edition" display_order="51" cde="2722309" cde_ver="1.000" xsd_ver="2.6" tier="1" owner="TSS" procurement_status="Completed" restricted="false" source_system_identifier="1080001">6th</shared_stage:system_version>
        <shared_stage:tnm_categories>
            <shared_stage:pathologic_categories>
                <shared_stage:pathologic_T preferred_name="ajcc_tumor_pathologic_pt" display_order="52" cde="3045435" cde_ver="1.000" xsd_ver="2.6" tier="1" owner="TSS" procurement_status="Completed" restricted="false" source_system_identifier="175336">T1c</shared_stage:pathologic_T>
            </shared_stage:pathologic_categories>
        </shared_stage:tnm_categories>
    </shared_stage:stage_event>       
    <rx:drugs/>
    <rad:radiations/>
</brca:patient>
</brca:tcga_bcr>

data.frame

submitted_tumor_site  race  bcr_patient_barcode  ajcc_staging_edition ajcc_tumor_pathologic_pt
Breast                WHITE  TCGA-A2-A0EV            6th               T1c

Since you have nested descendants and differing namespaces, consider simply running xpaths to each needed xml value. Then bind them together into a dataframe. An outer lapply() is run across the number of brca:patient nodes with a checkpath() function to account for possible missing children or descendant nodes:

# Indices of the brca:patient nodes. seq_along() gives an empty sequence when
# nothing matches, whereas 1:length(...) would give c(1, 0).
patientnum <- seq_along(xpathSApply(doc, "//brca:patient"))

# Return the first element of an XPath result, or NA when nothing matched.
#
# xpath: the vector or list returned by an XPath query (possibly empty).
checkpath <- function(xpath) {
  # The test is a scalar, so plain if/else is used instead of the vectorised
  # ifelse(); the value is returned visibly rather than via an assignment.
  if (length(xpath) > 0) {
    xpath[[1]]
  } else {
    NA
  }
}

# Build one named vector of field values per patient index.
patientdata <- lapply(patientnum, function(i) {
  # XPath steps, relative to the i-th brca:patient node, in column order.
  steps <- c(
    "clin_shared:tumor_tissue_site",
    "descendant::clin_shared:race",
    "descendant::shared:bcr_patient_barcode",
    "descendant::shared_stage:system_version",
    "descendant::shared_stage:pathologic_T"
  )
  labels <- c("tumor_tissue_site", "race", "bcr_patient_barcode",
              "system_version", "pathologic_T")

  # checkpath() maps an empty match to NA, so every patient yields one value
  # per step even when a node is missing.
  found <- lapply(steps, function(step) {
    query <- paste0("//brca:patient[", i, "]/", step)
    checkpath(xpathSApply(doc, query, xmlValue))
  })

  setNames(unlist(found, use.names = FALSE), labels)
})

# Stack the per-patient vectors row-wise and convert the result to a data frame.
patients <- data.frame(do.call(rbind, patientdata), stringsAsFactors = FALSE)

Alternatively, you can still use xmlToDataFrame(), but this requires flattening and simplifying your XML first, which can be done with XSLT (the XML transformation language and sibling to XPath).

While R does not have a dedicated, universal library for XSLT, you can use external processors including ones in other languages (Python, Java, PHP, even Excel VBA), dedicated .exe (Saxon, Xalan), or command line interpreters (PowerShell, Bash). And R can call each one with system() :

XSLT Script

<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
               xmlns:brca="http://tcga.nci/bcr/xml/clinical/brca/2.7"
               xmlns:clin_shared="http://tcga.nci/bcr/xml/clinical/shared/2.7"
               xmlns:shared="http://tcga.nci/bcr/xml/shared/2.7"
               xmlns:shared_stage="http://tcga.nci/bcr/xml/clinical/shared/stage/2.7"
               exclude-result-prefixes="brca clin_shared shared shared_stage">
<!-- Flattens the document into a namespace-free tree with one element per
     patient, each holding five simple child elements.
     exclude-result-prefixes keeps the source namespaces declared above (they
     are only needed for matching) from being copied onto the literal result
     elements in the output. -->
<xsl:output version="1.0" encoding="UTF-8" indent="yes" />
<xsl:strip-space elements="*"/>

  <!-- Root: re-create the element under its local name (no prefix) and
       process only its brca:patient children. -->
  <xsl:template match="/brca:tcga_bcr">
    <xsl:element name="{local-name()}">
      <xsl:apply-templates select="brca:patient"/>
    </xsl:element>
  </xsl:template>

  <!-- Patient: emit one flat child per wanted value. In XSLT 1.0, value-of
       outputs the string value of the first node selected, so each
       descendant:: lookup yields a single value. -->
  <xsl:template match="brca:patient">
    <xsl:element name="{local-name()}">
        <tumor_tissue_site><xsl:value-of select="clin_shared:tumor_tissue_site"/></tumor_tissue_site>
        <race><xsl:value-of select="descendant::clin_shared:race"/></race>
        <bcr_patient_barcode><xsl:value-of select="descendant::shared:bcr_patient_barcode"/></bcr_patient_barcode>
        <system_version><xsl:value-of select="descendant::shared_stage:system_version"/></system_version>
        <pathologic_T><xsl:value-of select="descendant::shared_stage:pathologic_T"/></pathologic_T>
    </xsl:element>
  </xsl:template>

</xsl:transform>

R Script

# Placeholder: run whichever external XSLT processor is available so that the
# transformed XML is written to disk before it is parsed below.
system("command line call to transform xml source with xslt")
# Example using a Python script:
# system('python "path/to/transformation_script.py"')

# Parse the flattened output; printing it shows the simplified structure.
doc <- xmlParse("path/to/transformed.xml")
doc
# <?xml version="1.0" encoding="UTF-8"?>
# <tcga_bcr>
#   <patient>
#     <tumor_tissue_site>Breast</tumor_tissue_site>
#     <race>WHITE</race>
#     <bcr_patient_barcode>TCGA-A2-A0EV</bcr_patient_barcode>
#     <system_version>6th</system_version>
#     <pathologic_T>T1c</pathologic_T>
#   </patient>
# </tcga_bcr>

# Each <patient> element becomes one row of the data frame.
patients <- xmlToDataFrame(
  nodes = getNodeSet(doc, path = "//patient"),
  stringsAsFactors = FALSE
)
# Parse the file and select the patient nodes under the root element, using
# the namespace prefix read from the root.
doc <- xmlInternalTreeParse(file)
ns <- names(xmlNamespace(xmlRoot(doc)))
patient <- getNodeSet(doc, path = paste0("/", ns, ":tcga_bcr/", ns, ":patient"))

# Children of the first patient node; the second child is displayed.
patient.fields <- xmlChildren(patient[[1]])
patient.fields[[2]]

The result was

<clin_shared:tumor_tissue_site preferred_name="submitted_tumor_site" display_order="9999" cde="3427536" cde_ver="2.000" xsd_ver="2.6" tier="2" owner="TSS" procurement_status="Completed" restricted="false" source_system_identifier="175314">Breast</clin_shared:tumor_tissue_site> 

How can I extract the value of the preferred_name attribute from patient.fields[[2]]?

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM