简体   繁体   中英

Importing xml data into a MYSQL db using python

I am having a problem with attributes when importing XML data into SQL. I have tried this 4 or 5 different ways but must be looking at the problem incorrectly. I am new to Python but have experience with SQL and some other languages. I have tried this a few different ways using XPath and it still doesn't work.

<?xml version="1.0" encoding="UTF-8"?>
<PARTS>
    <Header>
        <Version>6.5</Version>
    </Header>
    <Items>
        <Item MaintenanceType="A">
            <HazardousMaterialCode>N</HazardousMaterialCode>
            <ItemLevelGTIN GTINQualifier="UP">00651860733074</ItemLevelGTIN>
            <PartNumber>14-230615</PartNumber>
            <BrandAAIAID>BBGL</BrandAAIAID>
            <BrandLabel>Bilstein</BrandLabel>
            <ACESApplications>Y</ACESApplications>
            <ItemQuantitySize UOM="EA">1.0</ItemQuantitySize>
            <ContainerType>BX</ContainerType>
            <QuantityPerApplication Qualifier="MAX" UOM="EA">1</QuantityPerApplication>
        </Item>
    </Items>
</PARTS>

from xml.etree import ElementTree
import mysql.connector

# Parse bsn.xml and insert one row into the scoreitem table per <Item> element.
file_name = 'bsn.xml'
dom = ElementTree.parse(file_name)

# Open the MySQL connection and the cursor used for the inserts below.
mydb = mysql.connector.connect(user='frank', password='xxxxxx', host='127.0.0.1', database='scoresre', auth_plugin='mysql_native_password')
mycursor = mydb.cursor()

# Every <Item> element found under <Items>.
item = dom.findall('Items/Item')

for x in item:
    # Text content of the child tags of this <Item>.
    PartNumber = x.find('PartNumber').text
    BrandAAIAID = x.find('BrandAAIAID').text
    BrandLabel = x.find('BrandLabel').text
    ItemLevelGTIN = x.find('ItemLevelGTIN').text
    # NOTE(review): this XPath searches for a descendant *element* named
    # GTINQualifier that carries an attribute called "attr" equal to "UP".
    # In the sample XML, GTINQualifier is an attribute of <ItemLevelGTIN>,
    # not an element, so nothing matches and find() returns None -- which is
    # why the column is stored as NULL. Reading the attribute itself, e.g.
    # x.find('ItemLevelGTIN').get('GTINQualifier'), would yield "UP".
    GTINQualifier = x.find('.//GTINQualifier[@attr="UP"]')
    print(PartNumber, BrandAAIAID, BrandLabel, ItemLevelGTIN, GTINQualifier)

    # Values in the same order as the column list of the INSERT below.
    val = (PartNumber, BrandAAIAID, BrandLabel, ItemLevelGTIN, GTINQualifier)

    # Parameterized INSERT: the five %s placeholders are filled from val.
    sql = "INSERT INTO scoreitem (B15_PartNumber, B20_BrandAAIAID, B25_BrandLabel, B10_ItemLevelGTIN, " \
          "B11_GTINQualifier) VALUES (%s, %s, %s, %s, %s)"

    mycursor.execute(sql, val)
    # Commit after every inserted row.
    mydb.commit()

The code is not importing the attributes. For GTINQualifier="UP", UP comes in as NULL. For ItemQuantitySize UOM="EA", EA comes in as NULL when I use the same syntax as above. For Qualifier="MAX" UOM="EA", MAX and EA also come in as NULL. Thank you ahead of time.

Solution-A:

This solution follows the same approach you tried.

# Gather every <Item> element located under <Items>.
items = dom.findall('Items/Item')
# Column names that start with '@' refer to XML attributes; every other name
# is a child tag of <Item> whose text is read. The marker lets the loop below
# handle the two kinds differently.
target_cols = ['PartNumber', 'BrandAAIAID', 'BrandLabel', 'ItemLevelGTIN', '@GTINQualifier']

for item in items:
    instance_dict = {}
    for col in target_cols:
        label = col.replace("@", "")
        if not col.startswith("@"):
            # Plain tag: take the text of the direct child with that name.
            instance_dict[col] = item.find(label).text
            continue
        # Attribute: use the first descendant element that carries it.
        holder = item.find('.//*[@{}]'.format(label))
        instance_dict[col] = holder.attrib[label]

    # Row values in target_cols order, ready to be bound to an INSERT.
    val = tuple(map(instance_dict.__getitem__, target_cols))
    print(instance_dict)
    # **write to db here**

Solution-B:

Here is a solution that reads the XML data into a dataframe/dict. You can then pick out the fields that you need to write to the db.

Let's make some data

I introduced a dummy second <Item></Item> tag to check if this processes multiple tags properly.

# Sample input for testing without a file: the XML from the question plus a
# second dummy <Item> (MaintenanceType="B") to check multi-item handling.
xml_string = """
<PARTS>
    <Header>
        <Version>6.5</Version>
    </Header>
    <Items>
        <Item MaintenanceType="A">
            <HazardousMaterialCode>N</HazardousMaterialCode>
            <ItemLevelGTIN GTINQualifier="UP">00651860733074</ItemLevelGTIN>
            <PartNumber>14-230615</PartNumber>
            <BrandAAIAID>BBGL</BrandAAIAID>
            <BrandLabel>Bilstein</BrandLabel>
            <ACESApplications>Y</ACESApplications>
            <ItemQuantitySize UOM="EA">1.0</ItemQuantitySize>
            <ContainerType>BX</ContainerType>
            <QuantityPerApplication Qualifier="MAX" UOM="EA">1</QuantityPerApplication>
        </Item>
        <Item MaintenanceType="B">
            <HazardousMaterialCode>N</HazardousMaterialCode>
            <ItemLevelGTIN GTINQualifier="UP">00651860733084</ItemLevelGTIN>
            <PartNumber>14-230620</PartNumber>
            <BrandAAIAID>BBGL</BrandAAIAID>
            <BrandLabel>BilsteinZ</BrandLabel>
            <ACESApplications>Y</ACESApplications>
            <ItemQuantitySize UOM="EA">1.0</ItemQuantitySize>
            <ContainerType>BX</ContainerType>
            <QuantityPerApplication Qualifier="MAX" UOM="EA">1</QuantityPerApplication>
        </Item>
    </Items>
</PARTS>
"""

Solution

Optionally use pandas to see the entire xml as a dataframe.

Import Libraries

import pandas as pd
from xml.etree import ElementTree as ET
import mysql.connector

Now read each child tag of every <Item>, together with its attributes, and write the result to the db.

# Import the <Item> records of the XML document into the scoreitem table and
# collect them in a DataFrame for inspection.

# Fields bound to the INSERT, in the order of its column list.
target_cols = ['PartNumber', 'BrandAAIAID', 'BrandLabel', 'ItemLevelGTIN', 'GTINQualifier']

# Parameterized INSERT: the five %s placeholders are filled from `val`.
sql = "INSERT INTO scoreitem (B15_PartNumber, B20_BrandAAIAID, B25_BrandLabel, B10_ItemLevelGTIN, " \
      "B11_GTINQualifier) VALUES (%s, %s, %s, %s, %s)"

dict_cols = [None] # (either a list with a None) or (is equal to target_cols)

write_to_db = False # Set this to true when you write to db

file_name = 'bsn.xml'

# Let us test with the xml_string first
# set 'xml_source_is_file = True'
# when working with a file.
xml_source_is_file = False
if xml_source_is_file:
    # ET.parse() returns an ElementTree, which cannot be iterated directly;
    # take its root Element so both branches yield the same kind of object.
    dom = ET.parse(file_name).getroot()
else:
    dom = ET.fromstring(xml_string)

if write_to_db:
    mydb = mysql.connector.connect(user='frank', password='xxxxxx',
                                   host='127.0.0.1',
                                   database='scoresre',
                                   auth_plugin='mysql_native_password')
    mycursor = mydb.cursor()


def _collect_fields(item, wanted):
    """Return a dict of the fields found on the direct children of *item*.

    For every child element the tag name maps to the element's text, and each
    attribute name of that child maps to the attribute's value. Attributes are
    stored after the text, so an attribute overrides a tag of the same name.

    *wanted* is either a list whose first entry is None (keep every field) or
    a list of the field names to keep.
    """
    keep_all = wanted[0] is None
    fields = {}
    for child in item:
        # Any '@' is removed so the keys are plain names.
        tag = child.tag.replace('@', '')
        if keep_all or tag in wanted:
            fields[tag] = child.text
        for attr_name, attr_value in child.attrib.items():
            key = attr_name.replace('@', '')
            if keep_all or key in wanted:
                fields[key] = attr_value
    return fields


# Maps the position of each <Item> within <Items> to its field dict.
consolidated_dict = {}

try:
    for section in dom:
        if section.tag != 'Items':
            continue
        for i, item in enumerate(section):
            instance_dict = _collect_fields(item, dict_cols)
            consolidated_dict[i] = instance_dict
            # Raises KeyError if a target column is absent from this item.
            val = tuple(instance_dict[col_name] for col_name in target_cols)
            print(val)
            # Write to db here
            if write_to_db:
                mycursor.execute(sql, val)
                # Commit after every inserted row.
                mydb.commit()
finally:
    # Release the cursor and connection even if an insert fails part-way.
    if write_to_db:
        mycursor.close()
        mydb.close()

# One row per item, one column per collected field.
df = pd.DataFrame(consolidated_dict).T
# Bare expression: displays the frame when run in a notebook cell.
df

PS: Note that there are two switches, write_to_db and xml_source_is_file. In your case, you would need to set both of them to True to read the data from the XML file and write it to the database.

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please credit the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM