xml2 parse SQLServer Generated XML to dataframe

I'm wrestling with a data source that has changed XML reporting formats, resulting in a far more verbose file that has me completely stumped on how grab the name value pairs. How do I construct an appropriate xpath to access the name value pairs with the XML below?

library(xml2)

xml <- read_xml('
<root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <xsd:schema targetNamespace="urn:schemas-microsoft-com:sql:SqlRowSet1" xmlns:schema="urn:schemas-microsoft-com:sql:SqlRowSet1" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:sqltypes="http://schemas.microsoft.com/sqlserver/2004/sqltypes" elementFormDefault="qualified">
        <xsd:import namespace="http://schemas.microsoft.com/sqlserver/2004/sqltypes" schemaLocation="http://schemas.microsoft.com/sqlserver/2004/sqltypes/sqltypes.xsd"/>
        <xsd:element name="ogridroles">
            <xsd:complexType>
                <xsd:sequence>
                    <xsd:element name="ogrid_cde" type="sqltypes:int" nillable="1"/>
                    <xsd:element name="role" nillable="1">
                        <xsd:simpleType>
                            <xsd:restriction base="sqltypes:char" sqltypes:localeId="1033" sqltypes:sqlCompareOptions="IgnoreCase IgnoreKanaType IgnoreWidth" sqltypes:sqlSortId="52">
                                <xsd:maxLength value="1"/>
                            </xsd:restriction>
                        </xsd:simpleType>
                    </xsd:element>
                </xsd:sequence>
            </xsd:complexType>
        </xsd:element>
    </xsd:schema>
    <ogridroles xmlns="urn:schemas-microsoft-com:sql:SqlRowSet1">
        <ogrid_cde>28</ogrid_cde>
        <role>T</role>
    </ogridroles>
    <ogridroles xmlns="urn:schemas-microsoft-com:sql:SqlRowSet1">
        <ogrid_cde>75</ogrid_cde>
        <role>T</role>
    </ogridroles>
    <ogridroles xmlns="urn:schemas-microsoft-com:sql:SqlRowSet1">
        <ogrid_cde>93</ogrid_cde>
        <role>O</role>
    </ogridroles>
    <ogridroles xmlns="urn:schemas-microsoft-com:sql:SqlRowSet1">
        <ogrid_cde>135</ogrid_cde>
        <role>O</role>
    </ogridroles>
    <ogridroles xmlns="urn:schemas-microsoft-com:sql:SqlRowSet1">
        <ogrid_cde>149</ogrid_cde>
        <role>O</role>
    </ogridroles>
    <ogridroles xmlns="urn:schemas-microsoft-com:sql:SqlRowSet1">
        <ogrid_cde>168</ogrid_cde>
        <role>O</role>
    </ogridroles>
</root>')

# doesn't return a nodeset
xml_find_all(xml, ".//ogridroles")
#> {xml_nodeset (0)}

Created on 2019-01-04 by the reprex package (v0.2.1)

The XML schema was making the data nodeset tricky to access. Using xml2::read_html to read the source XML and then using rvest::html_nodes to convert the data to a nodeset allowed me to access the underlying name value pairs. This was informed by the SO post here

Here is a working example:

library(xml2)
library(rvest)
library(tidyverse)

xml <- read_html('
<root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <xsd:schema targetNamespace="urn:schemas-microsoft-com:sql:SqlRowSet1" xmlns:schema="urn:schemas-microsoft-com:sql:SqlRowSet1" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:sqltypes="http://schemas.microsoft.com/sqlserver/2004/sqltypes" elementFormDefault="qualified">
        <xsd:import namespace="http://schemas.microsoft.com/sqlserver/2004/sqltypes" schemaLocation="http://schemas.microsoft.com/sqlserver/2004/sqltypes/sqltypes.xsd"/>
        <xsd:element name="ogridroles">
            <xsd:complexType>
                <xsd:sequence>
                    <xsd:element name="ogrid_cde" type="sqltypes:int" nillable="1"/>
                    <xsd:element name="role" nillable="1">
                        <xsd:simpleType>
                            <xsd:restriction base="sqltypes:char" sqltypes:localeId="1033" sqltypes:sqlCompareOptions="IgnoreCase IgnoreKanaType IgnoreWidth" sqltypes:sqlSortId="52">
                                <xsd:maxLength value="1"/>
                            </xsd:restriction>
                        </xsd:simpleType>
                    </xsd:element>
                </xsd:sequence>
            </xsd:complexType>
        </xsd:element>
    </xsd:schema>
    <ogridroles xmlns="urn:schemas-microsoft-com:sql:SqlRowSet1">
        <ogrid_cde>28</ogrid_cde>
        <role>T</role>
    </ogridroles>
    <ogridroles xmlns="urn:schemas-microsoft-com:sql:SqlRowSet1">
        <ogrid_cde>75</ogrid_cde>
        <role>T</role>
    </ogridroles>
    <ogridroles xmlns="urn:schemas-microsoft-com:sql:SqlRowSet1">
        <ogrid_cde>93</ogrid_cde>
        <role>O</role>
    </ogridroles>
    <ogridroles xmlns="urn:schemas-microsoft-com:sql:SqlRowSet1">
        <ogrid_cde>135</ogrid_cde>
        <role>O</role>
    </ogridroles>
    <ogridroles xmlns="urn:schemas-microsoft-com:sql:SqlRowSet1">
        <ogrid_cde>149</ogrid_cde>
        <role>O</role>
    </ogridroles>
    <ogridroles xmlns="urn:schemas-microsoft-com:sql:SqlRowSet1">
        <ogrid_cde>168</ogrid_cde>
        <role>O</role>
    </ogridroles>
</root>')

ogridroles <- html_nodes(xml, xpath = "//ogridroles")

fieldnames <- xml_name(xml_find_all(ogridroles, ".//*"))
fields <- xml_text(xml_find_all(ogridroles, ".//*"))
df <- tibble(fieldnames, fields) %>% 
    # assumes that each xml node has the the same fields
    group_by(fieldnames) %>% 
    mutate(index = 1,
           index = cumsum(index)) %>% 
    spread(key = "fieldnames", value = "fields")

df
#> # A tibble: 6 x 3
#>   index ogrid_cde role 
#>   <dbl> <chr>     <chr>
#> 1     1 28        T    
#> 2     2 75        T    
#> 3     3 93        O    
#> 4     4 135       O    
#> 5     5 149       O    
#> 6     6 168       O

Created on 2019-01-04 by the reprex package (v0.2.1)

2 Likes

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.