简体   繁体   中英

Creating a database of links using sqlite on python

I wrote the following code where I am able to follow the initial page to two new pages and repeat this process for 4 levels to record all url's from these pages. I want to create a database of all the links I have encountered. If I visited a page (ie followed it to gain access to more links) I want to record a 1 for that link and 0 if I have not visited the page.

    def getlinks(xurl):
     # given a Wikipedia article url,
     # return all links on that page to Wikipedia articles
     # (really should add error checking)
    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    import re

    xlinks = [] # initialize list of links
    hpage = urlopen(xurl) # read/open page
    bs = BeautifulSoup(hpage, 'html.parser') # parse page

     # find all links in div named 'bodyContent'
     # such that they start with '/wiki/' and contain no colon
    for link in bs.find('div', {'id':'bodyContent'}).find_all('a',
href=re.compile('^(/wiki/)((?!:).)*$')):
        if 'href' in link.attrs:
            # make the url complete and add to list
            xlinks.append('https://en.wikipedia.org{}'.format(link.attrs['href']))
    return xlinks # return list of urls

maxlevel = 4 # levels deep to follow

# branches to follow from each page on each level before the last
numbranches = 2
tasks = [] # initialize task list
mastlinks = set() # initialize master set of urls
iurl = 'https://en.wikipedia.org/wiki/Kevin_Bacon' # first page
ilevel = 1 # first (top) level
mastlinks.add(iurl) # add first page to master set

# add current level and page to tasks
tasks.append((ilevel, iurl))

import sqlite3
import csv
import os

visited = 0

db_connection = sqlite3.connect('dd5.db')
cursor = db_connection.cursor()
cretab = '''CREATE TABLE IF NOT EXISTS links (link TEXT PRIMARY KEY, visited BIT)'''
cursor.execute(cretab)


for ix in range(40): # do no more than 40 pages
    if not tasks: # if no more tasks, we're done
        break

    # remove next task level, url from end of task list
    level, url = tasks.pop()
    print('\n', ix, 'level', level, url)
    visited = 1
    links = getlinks(url) # get links from current %page
    cursor.execute("INSERT OR IGNORE INTO links VALUES (?, ?)", (links, visited))
    print(len(links), 'article links')
    ulinks = set(links)
    print(len(ulinks), 'unique article links')
    newlinks = ulinks.difference(mastlinks)
    mastlinks = mastlinks.union(newlinks)
    print(len(newlinks), 'new unique article links')
    linklist = list(newlinks)
    cursor.execute("UPDATE links SET visited=? WHERE link=?", (visited, links))
    print('sample links:')
    for link in linklist[:10]:
        print(link)
    if level < maxlevel:
        for link in linklist[:numbranches]:
            print('following', link)

            # add next level link to tasks
            tasks.append((level + 1, link))

I keep getting the "InterfaceError: Error binding parameter 0 - probably unsupported type.' error. I am also not sure if my placement for sqlite related code is correct as I am a beginner in this area. Could you help? Thank you!

Well, links is a list. Using .execute() you cannot INSERT or UPDATE with a list.

You can loop over that list:

for link in links:
    cursor.execute("INSERT OR IGNORE INTO links VALUES (?, ?)", (link, visited))

Another potential solution is to use .executemany() which you could use in this way:

to_insert = []
for link in links:
    to_insert.append((link,visited))
cursor.executemany("INSERT OR IGNORE INTO links VALUES (?, ?)", to_insert)

A similar problem exists with your UPDATE code, and the above information applies to that query as well.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM