
Issues with SQLite Table - Specifically foreign keys

The main issue I am having is that I have a table which should not contain any duplicate entries, because I was hoping to give it a primary key that would be referenced from a separate table.

From what I understand about normalization, it is better to design the database this way. Right now, however, the table has a bunch of duplicate entries: there should only be 6 unique entries, but there are 30, with each unique entry repeated 5 times.

How should I fix this? Should I handle it when importing the data, or by using the UNIQUE keyword? Note that when I tried to use UNIQUE this time around, it gave me an error, since the data I am importing does indeed contain duplicate entries.
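To illustrate, the kind of constraint I tried looks roughly like this (a minimal sketch only; the table, column, and inserted value here are just placeholders for my real data):

import sqlite3

con = sqlite3.connect(':memory:')
cur = con.cursor()

# UNIQUE tells SQLite to reject a second row with the same division value
cur.execute("CREATE TABLE Divs(Did INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, division TEXT UNIQUE)")

cur.execute("INSERT INTO Divs(division) VALUES(?)", ("Atlantic",))
cur.execute("INSERT INTO Divs(division) VALUES(?)", ("Atlantic",))  # raises sqlite3.IntegrityError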

EDIT:

This is what my items look like:

from scrapy.item import Item, Field

class TeamStats(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    team = Field()
    division = Field()
    rosterurl = Field()
    player_desc = Field()
    playerurl = Field()
    pass

class Player(Item):
    exp = Field()
    pass

This is what my spider looks like:

import scrapy
import string
import re
from scrapy.selector import HtmlXPathSelector                                                                       ##needed to import xpath command
from scrapy.shell import inspect_response                                                                           ##needed for Response object
from nbastats.items import TeamStats, Player                                                                  ##needed to import player stats


class NbastatsSpider(scrapy.Spider):
    name = "nbaStats"

    start_urls = [
        "http://espn.go.com/nba/teams"                                                                              ##only start not allowed because had some issues when navigated to team roster pages
        ]
    def parse(self,response):
        items = []                                                                                                  ##array or list that stores TeamStats item
        i=0                                                                                                         ##counter needed for older code

        for division in response.xpath('//div[@id="content"]//div[contains(@class, "mod-teams-list-medium")]'):     
            for team in division.xpath('.//div[contains(@class, "mod-content")]//li'):
                item = TeamStats()

                item['division'] = division.xpath('.//div[contains(@class, "mod-header")]/h4/text()').extract()[0]            
                item['team'] = team.xpath('.//h5/a/text()').extract()[0]
                item['rosterurl'] = "http://espn.go.com" + team.xpath('.//div/span[2]/a[3]/@href').extract()[0]
                items.append(item)
                print(item['rosterurl'])
                request = scrapy.Request(item['rosterurl'], callback = self.parseWPNow)
                request.meta['play'] = item

                yield request

    def parseWPNow(self, response):
        item = response.meta['play']
        item = self.parseRoster(item, response)
        return item

    def parseRoster(self, item, response):
        players1 = []
        int = 0
        for players in response.xpath("//td[@class='sortcell']"):
            play = {}
            play['name'] = players.xpath("a/text()").extract()[0]
            play['position'] = players.xpath("following-sibling::td[1]").extract()[0]
            play['age'] = players.xpath("following-sibling::td[2]").extract()[0]
            play['height'] = players.xpath("following-sibling::td[3]").extract()[0]
            play['weight'] = players.xpath("following-sibling::td[4]").extract()[0]
            play['college'] = players.xpath("following-sibling::td[5]").extract()[0]
            play['salary'] = players.xpath("following-sibling::td[6]").extract()[0]
            players1.append(play)
        item['playerurl'] = response.xpath("//td[@class='sortcell']/a").extract()
        item['player_desc']=players1
        return item

This is what my pipeline looks like:

import sqlite3 as lite


class NbastatsPipeline(object):

    def __init__(self):
        self.setupDBCon()
        self.createTables()

    def setupDBCon(self):
        self.con = lite.connect('test.db')
        self.cur = self.con.cursor()

    def createTables(self):
        self.dropTeamsTable()
        self.dropPlayersTable()
        self.dropDivsTable()

        self.createTeamsTable()
        self.createPlayersTable()
        self.createDivsTable()

    def createTeamsTable(self):
        self.cur.execute("CREATE TABLE IF NOT EXISTS Teams(P_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, \
            team TEXT, \
            DivId INTEGER, \
            FOREIGN KEY (DivId) REFERENCES Divs(Did) \
            )")

    def createDivsTable(self):
        self.cur.execute("CREATE TABLE IF NOT EXISTS Divs(Did INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, \
            division TEXT)")

    def createPlayersTable(self):
        self.cur.execute("CREATE TABLE IF NOT EXISTS Players(player_name TEXT, \
            salary TEXT, \
            weight INTEGER, \
            age INTEGER, \
            college TEXT )")

    def dropTeamsTable(self):
        self.cur.execute("DROP TABLE IF EXISTS Teams")
    def dropPlayersTable(self):
        self.cur.execute("DROP TABLE IF EXISTS Players")
    def dropDivsTable(self):
        self.cur.execute("DROP TABLE IF EXISTS Divs")

    def closeDB(self):
        self.con.close()

    def __del__(self):
        self.closeDB()

    def process_item(self, item, spider):
        for key, value in item.iteritems():
            if key == "division":
                print(item.get('division', ""))
                self.cur.execute("INSERT INTO Divs( division ) VALUES(?)", (item.get('division', ""),))
                self.con.commit() 
#                self.storeInDb(item) #this is the line you'll use when ready to completely pass item through to storeInDb but it'll be lower in code

        return item

The usual approach is to enforce uniqueness at the database level and handle any violations in your application code.
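For the database-level part, that means declaring a UNIQUE constraint on the column that must not repeat. A minimal sketch, reusing the Divs table from your pipeline:

    def createDivsTable(self):
        # UNIQUE makes SQLite reject a second row with the same division value
        self.cur.execute("CREATE TABLE IF NOT EXISTS Divs(Did INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, \
            division TEXT UNIQUE)")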

For Scrapy specifically, if you insert the records in a pipeline, you usually end up with a construction like this:

import sqlite3
from scrapy.exceptions import DropItem     

try:
    cursor.execute("""
        INSERT INTO 
            table 
            (field1, field2) 
        VALUES (?, ?)""", (field1, field2))
except sqlite3.IntegrityError:
    raise DropItem('Duplicate entry')
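Applied to your pipeline, that might look roughly like this (a sketch assuming the Divs table has the UNIQUE constraint on division shown above; only the division handling of process_item is shown, and the duplicate insert is skipped rather than dropping the whole item):

    def process_item(self, item, spider):
        try:
            self.cur.execute("INSERT INTO Divs(division) VALUES(?)", (item.get('division', ""),))
            self.con.commit()
        except lite.IntegrityError:
            # this division is already stored, so skip the insert
            pass
        return item

Alternatively, SQLite's INSERT OR IGNORE INTO ... achieves the same thing without raising an exception.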

There are also existing examples of SQLite pipelines for Scrapy that you can use as a reference.


There is also a project called scrapy-dblite which provides a nice abstraction: you don't have to go down to raw SQL queries, since there is a simple ORM built in. For example:

from scrapy.exceptions import DropItem
from myproject.items import Product
import dblite

class StoreItemsPipeline(object):
    def __init__(self):
        self.ds = None

    def open_spider(self, spider):
        self.ds = dblite.open(Product, 'sqlite://db/products.sqlite:items', autocommit=True)

    def close_spider(self, spider):
        self.ds.commit()
        self.ds.close()

    def process_item(self, item, spider):
        if isinstance(item, Product):
            try:
                self.ds.put(item)
            except dblite.DuplicateItem:
                raise DropItem("Duplicate item found: %s" % item)
        else:
            raise DropItem("Unknown item type, %s" % type(item))
        return item
