I wrote the following code, which follows the initial page to two new pages and repeats this process for 4 levels, recording all URLs found on these pages. I want to create a database of all the links I have encountered. If I visited a page (i.e., followed it to gain access to more links), I want to record a 1 for that link, and a 0 if I have not visited the page.
def getlinks(xurl):
    """Return the Wikipedia article links found on the page at ``xurl``.

    Only anchors inside the ``div`` whose id is ``bodyContent`` are
    considered, and only those whose ``href`` starts with ``/wiki/`` and
    contains no colon. Each match is returned as an absolute
    ``https://en.wikipedia.org`` URL, in document order; duplicates are kept.

    Returns an empty list when the page has no ``bodyContent`` div.
    Errors raised while opening the URL propagate to the caller.
    """
    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    import re

    # starts with '/wiki/' and contains no colon anywhere after it
    article_href = re.compile(r'^(/wiki/)((?!:).)*$')

    # the parser consumes the response inside the constructor, so the
    # connection can be closed as soon as the soup is built
    with urlopen(xurl) as hpage:
        bs = BeautifulSoup(hpage, 'html.parser')

    body = bs.find('div', {'id': 'bodyContent'})
    if body is None:
        # find() yields None when nothing matches; without this guard the
        # chained find_all() would fail with AttributeError
        return []

    # the href filter only matches tags that carry an href attribute,
    # so the lookup below cannot miss
    return [
        'https://en.wikipedia.org{}'.format(anchor.attrs['href'])
        for anchor in body.find_all('a', href=article_href)
    ]
import sqlite3
import csv
import os

# Crawl limits: how deep to go, and how many links to follow from each
# page on every level except the last.
maxlevel = 4
numbranches = 2

# Starting point of the crawl.
iurl = 'https://en.wikipedia.org/wiki/Kevin_Bacon'
ilevel = 1

# Every url seen so far, and the stack of (level, url) pages still to fetch.
mastlinks = {iurl}
tasks = [(ilevel, iurl)]

visited = 0
db_connection = sqlite3.connect('dd5.db')
cursor = db_connection.cursor()
cretab = '''CREATE TABLE IF NOT EXISTS links (link TEXT PRIMARY KEY, visited BIT)'''
cursor.execute(cretab)
# NOTE(review): nothing in this script commits or closes db_connection.

for ix in range(40):  # hard cap of 40 fetched pages
    if not tasks:
        break  # nothing left to follow

    # take the most recently queued page (depth-first order)
    level, url = tasks.pop()
    print('\n', ix, 'level', level, url)
    visited = 1
    links = getlinks(url)

    # NOTE(review): `links` is a list, and sqlite3 cannot bind a list to a
    # single "?" placeholder -- this call is the source of the binding
    # error quoted in the question.
    cursor.execute("INSERT OR IGNORE INTO links VALUES (?, ?)", (links, visited))

    print(len(links), 'article links')
    ulinks = set(links)
    print(len(ulinks), 'unique article links')

    # keep only urls never seen before, then remember them
    newlinks = ulinks - mastlinks
    mastlinks = mastlinks | newlinks
    print(len(newlinks), 'new unique article links')
    linklist = list(newlinks)

    # NOTE(review): same list-binding problem as the INSERT above.
    cursor.execute("UPDATE links SET visited=? WHERE link=?", (visited, links))

    print('sample links:')
    for link in linklist[:10]:
        print(link)

    if level >= maxlevel:
        continue  # deepest level reached: record links but do not follow

    for link in linklist[:numbranches]:
        print('following', link)
        # queue the link one level further down
        tasks.append((level + 1, link))
I keep getting the error "InterfaceError: Error binding parameter 0 - probably unsupported type." I am also not sure whether my placement of the sqlite-related code is correct, as I am a beginner in this area. Could you help? Thank you!
Well, `links` is a list. With `.execute()` you cannot INSERT or UPDATE using a list as a bound parameter; each `?` placeholder takes a single value.
You can loop over that list:
# Bind one url per statement: each "?" receives exactly one value.
insert_sql = "INSERT OR IGNORE INTO links VALUES (?, ?)"
for link in links:
    cursor.execute(insert_sql, (link, visited))
Another potential solution is to use `.executemany()`, which you could use in this way:
# One (link, visited) row per url; executemany runs the statement per row.
to_insert = [(link, visited) for link in links]
cursor.executemany("INSERT OR IGNORE INTO links VALUES (?, ?)", to_insert)
A similar problem exists with your UPDATE code, and the above information applies to that query as well.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.