
OperationalError: near “state”: syntax error

I made a simple crawler using Python and sqlite3, but some errors appear on the cmd screen. I have searched for this kind of error on stackoverflow.com but could not find a solution. Some Q&As suggested that I should use ? instead of % in the sqlite command, like SELECT COUNT(*) FROM urls WHERE url='%s'state=1"%url , but it did not work.

Here is the error:

Traceback (most recent call last):
  File "C:\Python27\crawl.py", line 239, in (module)
    parseArticle( u )
  File "C:\Python27\crawl.py", line 146, in parseArticle
    gaterNeighborInfo(soup)
  File "C:\Python27\crawl.py", line 68, in gaterNeighborInfo
    if url and url.startswith('http://') and db.isCrawledURL(url)<1:
  File "C:\Python27\crawl.py", line 217, in isCrawledURL
    self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s'state=1"%url) 
OperationalError: near "state": syntax error

As you can see, the traceback is nested, but I don't know what is wrong or where this error starts.

Here is the source code:

# -*- coding: utf-8 -*-

from BeautifulSoup import BeautifulSoup
import robotparser
import urllib2
import time, traceback, re, sys, os
import sqlite3

crawler_name = 'python_daum_crawler'
mainpage = 'http://blog.daum.net/'
mainpath = './data/'

# Set up the robot parser.
rp = robotparser.RobotFileParser(mainpage + 'robot.txt')
rp.read()

def canFetch(url):
    "수집 가능 여부를 체크합니다."
    return rp.can_fetch(crawler_name, url)

def getContent(url, delay=1):
    "웹문서를 다운로드 합니다."
    time.sleep(delay)

    if not canFetch(url):
        # Do not crawl pages the webmaster does not want crawled.
        print('This url can NOT be fetched by our crawler :', url)
        return None
    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', crawler_name)]
        contents = opener.open(url).read()
    except:
        traceback.print_exc()
        return None
    return contents

def getArticleInfo(soup):
    "daum blog 내의 article info를 얻어 옵니다."

    rBlog = re.compile('.+blog.daum.net/|w+/|d+.*?')
    URLs = soup('a',{'href':rBlog})

    return [ u.get('href').split('?')[0] for u in URLs ]

def getOwnArticles(contents):
    "해당 블로그에 포함되는 글의 목록을 가져옵니다."
    ret = []
    soup = BeautifulSoup(contents)
    rBlog = re.compile('.+/BlogView.+')
    for u in soup('a', {'href':rBlog}):
        href = u.get('href')
        article = href.split('articleno=')[1].split('&')[0]
        if ret.count(article)<1:
            ret.append(article)
    return ret

def gatherNeighborInfo(soup):
    "이웃 블로거/혹은 다녀간 블로거 정보를 수집합니다."

    #Find daum blog related addresses.
    rBlog = re.compile('http://blog.daum.net/|w+')
    Neighbors = soup('a',{'href':rBlog})
    cnt = 0
    for n in Neighbors:
        url = n.get('href')
        blogname = url.split('/')[-1]
        if url and url.startswith('http://') and db.isCrawledURL(url)<1:
            db.insertURL( url, 1 )

            url2 = getRedirectedURL(url)
            if not url2: continue
            re_url = 'http://blog.daum.net' + url2
            body = getContent(re_url, 0)
            if body:
                for u in getOwnArticles(body):
                    #Store this blogger's own article addresses in the db.
                    fullpath = 'http://blog.daum.net/'+blogname+'/'+u
                    cnt += db.insertURL(fullpath)
    if cnt>0: print('%d neighbor articles inserted'%cnt)

def getRedirectedURL(url):
    "본문에 해당하는 프레임의 url을 얻어옵니다."
    contents = getContent(url)
    if not contents: return None

    #redirect
    try:
        soup = BeautifulSoup(contents)
        frame = soup('frame')
        src = frame[0].get('src')
    except:
        src = None
    return src

def getBody(soup, parent):
    "본문 텍스트를 구합니다."

    #Find the iframe containing the body address.
    rSrc = re.compile('.+/ArticleContentsView.+')
    iframe = soup('iframe',{'src':rSrc})
    if len(iframe)>0:
        src = iframe[0].get('src')
        iframe_src = 'http://blog.daum.net'+src

        #A plain request will not work; the Referer must be set so the server treats it as a browser request.
        req = urllib2.Request(iframe_src)
        req.add_header('Refere', parent)
        body = urllib2.urlopen(req).read()
        soup = BeautifulSoup(body)
        return str(soup.body)
    else:
        print('NULL contents')
        return ''

def parseArticle(url):
    "해당 url을 parsing하고 저장합니다."

    #Get the blog id and article id.
    article_id = url.split('/')[-1]
    blog_id = url.split('/')[-2]

    #Get the redirected address.
    newURL = getRedirectedURL(url)

    if newURL:
        try:
            #Create the blog directory.
            os.mkdir(mainpath+blog_id)
        except:
            #Ignore errors raised while creating the directory.
            pass

        newURL = 'http://blog.daum.net'+newURL
        contents = getContent(newURL, 0)
        if not contents:
            print('Null Contents...')
            #If the url is not valid, mark it as an error (-1).
            db.updateURL(url, -1)
            return

        #Parse the HTML.
        soup = BeautifulSoup(contents)

        #Check for neighbor blogger info.
        gatherNeighborInfo(soup)

        #Insert blog URLs into the db if any are present.
        n=0
        for u in getArticleInfo(soup):
            n += db.insertURL(u)
        if n>0: print('inserted %d urls from %s'%(n,url))

        #Get the title.
        sp = contents.find('<title>')
        if sp>-1:
            ep = contents[sp+7:].find('<title>')
            title = contents[sp+7:sp+ep+7]
        else:
            title = ''

        #Clean up the body HTML for readability.
        contents = getBody(soup, newURL)

        #Remove style/script tags and any remaining markup.
        pStyle = re.compile('<style(.*?)>(.*?)</style>', re.IGNORECASE | re.MULTILINE | re.DOTALL )
        contents = pStyle.sub('', contents)
        pStyle = re.compile('<script(.*?)>(.*?)</script>', re.IGNORECASE | re.MULTILINE | re.DOTALL )
        contents = pStyle.sub('', contents)
        pStyle = re.compile("<(.*?)>", re.IGNORECASE | re.MULTILINE | re.DOTALL )
        contents = pStyle.sub('', contents)

        #Save as a txt file.
        fTXT = open( mainpath + blog_id + '/' + article_id + '.txt', 'w')
        fTXT.write( title+'|n')
        fTXT.write(contents)
        fTXT.close()

        #Mark the url as processed in the db.
        db.updateURL(url)

    else:
        print('Invalid blog article...')
        #If the url is not valid, mark it as an error (-1).
        db.updateURL(url, -1)

class DB:
    "SQLITE3 wrapper class"
    def __init__(self):
        self.conn = sqlite3.connect('crawlerDB')
        self.cursor = self.conn.cursor()
        self.cursor.execute('CREATE TABLE IF NOT EXISTS urls(url text, state int)')
        self.cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS IDX001 ON urls(url)')
        self.cursor.execute('CREATE INDEX IF NOT EXISTS IDX002 ON urls(state)')

    def __del__(self):
        self.conn.commit()
        self.cursor.close()

    def insertURL(self, url, state=0):
        try:
            self.cursor.execute("INSERT INTO urls VALUES ('%s',%d)"%(url,state))
            self.conn.commit()
        except:
            return 0
        else:
            return 1

    def selectUncrawledURL(self):
        self.cursor.execute('SELECT * FROM urls where state=0')
        return [ row[0] for row in self.cursor.fetchall() ]

    def updateURL(self, url, state=1):
        self.cursor.execute("UPDATE urls SET state=%d WHERE url='%s'"%(state,url))

    def isCrawledURL(self, url):
        self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s'state=1"%url)
        ret = self.cursor.fetchone()
        return ret[0]

db = DB()

if __name__=='__main__':
    print('starting crawl.py...')

    #Check the main page.
    contents = getContent(mainpage)
    URLs = getArticleInfo( BeautifulSoup( contents ) )
    nSuccess = 0
    for u in URLs:
        nSuccess += db.insertURL(u)
    print('inserted %d new pages.'%nSuccess)

    while 1:
        for u in db.selectUncrawledURL():
            #Fetch urls not yet crawled and process them.
            print('downloading %s'%u)
            try:
                parseArticle( u )
            except:
                traceback.print_exc()
                db.updateURL( u, -1 ) 

You are generating incorrect SQL; you probably want url=... AND state=1 (with a space and the AND keyword) to match both criteria.
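
For illustration only, the minimal syntax fix to the original query would be to add the space and AND (still using string interpolation, which the parameterized version below should replace):

self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s' AND state=1" % url)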

Also, you should not use string interpolation; use SQL parameters instead:

def isCrawledURL(self, url):
    self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url=? AND state=1", (url,))
    ret = self.cursor.fetchone()
    return ret[0]

This applies to all your queries, like:

self.cursor.execute("INSERT INTO urls VALUES (?, ?)", (url,state))

and:

self.cursor.execute("UPDATE urls SET state=? WHERE url=?", (state,url))

Note that the parameters are passed into the cursor.execute() calls as a second argument (a sequence of values).
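
As a minimal, self-contained sketch (using an in-memory database and a made-up example URL, not the asker's actual data), the parameterized style looks like this end to end:

import sqlite3

# Same schema as the question's DB class, but in memory and with
# parameterized queries throughout.
conn = sqlite3.connect(':memory:')
cursor = conn.cursor()
cursor.execute('CREATE TABLE IF NOT EXISTS urls(url text, state int)')
cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS IDX001 ON urls(url)')

# INSERT with a parameter tuple instead of string interpolation.
cursor.execute("INSERT INTO urls VALUES (?, ?)", ('http://blog.daum.net/example/1', 1))

# UPDATE with parameters.
cursor.execute("UPDATE urls SET state=? WHERE url=?", (1, 'http://blog.daum.net/example/1'))

# SELECT with parameters; a single-parameter tuple still needs the trailing comma.
cursor.execute("SELECT COUNT(*) FROM urls WHERE url=? AND state=1",
               ('http://blog.daum.net/example/1',))
print(cursor.fetchone()[0])  # prints 1

conn.commit()
conn.close()

Besides fixing syntax errors like the missing AND, parameters also handle quoting for you, so a URL containing a single quote will not break the query.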
