
Issues with SQLite Table - Specifically foreign keys

The main issue I am having is that I have a table which should not contain any duplicate entries, because I was hoping to give it a primary key that would be referenced from a separate table.

From what I understand about normalization, it is better to design the database this way. Right now, however, the table has a bunch of duplicate entries: there should only be 6 unique entries, but there are 30, with each unique entry repeated 5 times.

How should I fix this? Should I handle it when importing the data, or by using the UNIQUE keyword? Note that when I tried to use UNIQUE this time around, it gave me an error, since the data I am importing does indeed contain duplicate entries.
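To illustrate, the kind of constraint I tried looks roughly like this (a minimal sketch only; the table, column, and inserted value here are just placeholders for my real data):

import sqlite3

con = sqlite3.connect(':memory:')
cur = con.cursor()

# UNIQUE tells SQLite to reject a second row with the same division value
cur.execute("CREATE TABLE Divs(Did INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, division TEXT UNIQUE)")

cur.execute("INSERT INTO Divs(division) VALUES(?)", ("Atlantic",))
cur.execute("INSERT INTO Divs(division) VALUES(?)", ("Atlantic",))  # raises sqlite3.IntegrityError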

EDIT:

This is what my items look like:

from scrapy.item import Item, Field

class TeamStats(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    team = Field()
    division = Field()
    rosterurl = Field()
    player_desc = Field()
    playerurl = Field()
    pass

class Player(Item):
    exp = Field()
    pass

This is what my spider looks like:

import scrapy
import string
import re
from scrapy.selector import HtmlXPathSelector                                                                       ##needed to import xpath command
from scrapy.shell import inspect_response                                                                           ##needed for Response object
from nbastats.items import TeamStats, Player                                                                  ##needed to import player stats


class NbastatsSpider(scrapy.Spider):
    name = "nbaStats"

    start_urls = [
        "http://espn.go.com/nba/teams"                                                                              ##only start not allowed because had some issues when navigated to team roster pages
        ]
    def parse(self,response):
        items = []                                                                                                  ##array or list that stores TeamStats item
        i=0                                                                                                         ##counter needed for older code

        for division in response.xpath('//div[@id="content"]//div[contains(@class, "mod-teams-list-medium")]'):     
            for team in division.xpath('.//div[contains(@class, "mod-content")]//li'):
                item = TeamStats()

                item['division'] = division.xpath('.//div[contains(@class, "mod-header")]/h4/text()').extract()[0]            
                item['team'] = team.xpath('.//h5/a/text()').extract()[0]
                item['rosterurl'] = "http://espn.go.com" + team.xpath('.//div/span[2]/a[3]/@href').extract()[0]
                items.append(item)
                print(item['rosterurl'])
                request = scrapy.Request(item['rosterurl'], callback = self.parseWPNow)
                request.meta['play'] = item

                yield request

    def parseWPNow(self, response):
        item = response.meta['play']
        item = self.parseRoster(item, response)
        return item

    def parseRoster(self, item, response):
        players1 = []
        int = 0
        for players in response.xpath("//td[@class='sortcell']"):
            play = {}
            play['name'] = players.xpath("a/text()").extract()[0]
            play['position'] = players.xpath("following-sibling::td[1]").extract()[0]
            play['age'] = players.xpath("following-sibling::td[2]").extract()[0]
            play['height'] = players.xpath("following-sibling::td[3]").extract()[0]
            play['weight'] = players.xpath("following-sibling::td[4]").extract()[0]
            play['college'] = players.xpath("following-sibling::td[5]").extract()[0]
            play['salary'] = players.xpath("following-sibling::td[6]").extract()[0]
            players1.append(play)
        item['playerurl'] = response.xpath("//td[@class='sortcell']/a").extract()
        item['player_desc']=players1
        return item

This is what my pipeline looks like:

import sqlite3 as lite


class NbastatsPipeline(object):

    def __init__(self):
        self.setupDBCon()
        self.createTables()

    def setupDBCon(self):
        self.con = lite.connect('test.db')
        self.cur = self.con.cursor()

    def createTables(self):
        self.dropTeamsTable()
        self.dropPlayersTable()
        self.dropDivsTable()

        self.createTeamsTable()
        self.createPlayersTable()
        self.createDivsTable()

    def createTeamsTable(self):
        self.cur.execute("CREATE TABLE IF NOT EXISTS Teams(P_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, \
            team TEXT, \
            DivId INTEGER, \
            FOREIGN KEY (DivId) REFERENCES Divs(Did) \
            )")

    def createDivsTable(self):
        self.cur.execute("CREATE TABLE IF NOT EXISTS Divs(Did INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, \
            division TEXT)")

    def createPlayersTable(self):
        self.cur.execute("CREATE TABLE IF NOT EXISTS Players(player_name TEXT, \
            salary TEXT, \
            weight INTEGER, \
            age INTEGER, \
            college TEXT )")

    def dropTeamsTable(self):
        self.cur.execute("DROP TABLE IF EXISTS Teams")
    def dropPlayersTable(self):
        self.cur.execute("DROP TABLE IF EXISTS Players")
    def dropDivsTable(self):
        self.cur.execute("DROP TABLE IF EXISTS Divs")

    def closeDB(self):
        self.con.close()

    def __del__(self):
        self.closeDB()

    def process_item(self, item, spider):
        for key, value in item.iteritems():
            if key == "division":
                print(item.get('division', ""))
                self.cur.execute("INSERT INTO Divs( division ) VALUES(?)", (item.get('division', ""),))
                self.con.commit() 
#                self.storeInDb(item) #this is the line you'll use when ready to completely pass item through to storeInDb but it'll be lower in code

        return item

The usual approach is to enforce uniqueness at the database level and handle any violations in your application code.
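For the database-level part, that means declaring a UNIQUE constraint on the column that must not repeat. A minimal sketch, reusing the Divs table from your pipeline:

    def createDivsTable(self):
        # UNIQUE makes SQLite reject a second row with the same division value
        self.cur.execute("CREATE TABLE IF NOT EXISTS Divs(Did INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, \
            division TEXT UNIQUE)")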

For Scrapy specifically, if you insert the records in a pipeline, you usually end up with a construction like this:

import sqlite3
from scrapy.exceptions import DropItem     

try:
    cursor.execute("""
        INSERT INTO 
            table 
            (field1, field2) 
        VALUES (?, ?)""", (field1, field2))
except sqlite3.IntegrityError:
    raise DropItem('Duplicate entry')
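Applied to your pipeline, that might look roughly like this (a sketch assuming the Divs table has the UNIQUE constraint on division shown above; only the division handling of process_item is shown, and the duplicate insert is skipped rather than dropping the whole item):

    def process_item(self, item, spider):
        try:
            self.cur.execute("INSERT INTO Divs(division) VALUES(?)", (item.get('division', ""),))
            self.con.commit()
        except lite.IntegrityError:
            # this division is already stored, so skip the insert
            pass
        return item

Alternatively, SQLite's INSERT OR IGNORE INTO ... achieves the same thing without raising an exception.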

There are also existing examples of SQLite pipelines for Scrapy that you can use as a reference.


There is also a project called scrapy-dblite which provides a nice abstraction: you don't have to go down to raw SQL queries, since there is a simple ORM built in. For example:

from scrapy.exceptions import DropItem
from myproject.items import Product
import dblite

class StoreItemsPipeline(object):
    def __init__(self):
        self.ds = None

    def open_spider(self, spider):
        self.ds = dblite.open(Product, 'sqlite://db/products.sqlite:items', autocommit=True)

    def close_spider(self, spider):
        self.ds.commit()
        self.ds.close()

    def process_item(self, item, spider):
        if isinstance(item, Product):
            try:
                self.ds.put(item)
            except dblite.DuplicateItem:
                raise DropItem("Duplicate item found: %s" % item)
        else:
            raise DropItem("Unknown item type, %s" % type(item))
        return item
