Multi-threaded crawler in Python

I am trying to implement a multithreaded crawler that takes an initial URL, finds the links within that page, prints each link, and at the same time looks for links inside each of those linked pages.

This is my code:

import urllib.request, re, threading, csv
from queue import Queue
from bs4 import BeautifulSoup
from sys import exit

class a3_6:

    __url_q = Queue(100)
    __html_q = Queue()
    __data_q = Queue()
    __visited_urls = []

    def __init__(self, start_url, max_threads):
        self.__url_q.put(start_url)
        self.max_threads = max_threads

    def gethtml(self,url):
        try:
            req=urllib.request.Request(url)
            html=urllib.request.urlopen(req).read()
            self.__html_q.put(html)
        except urllib.error.URLError as e:
            print(e.reason)
        except:
            print("invalid: " + url)
        self.__visited_urls.append(url)

    def mine_thread(self):
        while True:
            if not self.__html_q.empty():
                soup = BeautifulSoup(self.__html_q.get(),"html.parser")
                for a in soup.find_all('a', href=True):
                    if a not in self.__visited_urls:
                        link='https://en.wikipedia.org'+a.get('href')
                        self.__url_q.put(link)
                        self.__data_q.put(link)
            else:
                break

    def store(self):
        while True:
            if not self.__data_q.empty():
                print (self.__data_q.get())

    def download_thread(self):
        while True:
            if not self.__url_q.empty():
                self.gethtml(self.__url_q.get())
            else:
                break

    def run(self):
        self.download_thread()
        self.mine_thread()
        self.store()

    def op(self):
        for x in range(self.max_threads):
            t = threading.Thread(target=self.run)
            t.daemon = True
            t.start()
        self.store()


if __name__ == '__main__':
    a=a3_6('https://en.wikipedia.org/wiki/Main_Page', 5)
    a.op()

EDIT: I edited the code and now I am getting proper results, but the program still does not terminate.
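(For context on the hang: worker loops written as while True, like store above, never receive a signal to stop. A common remedy, shown here as a minimal standalone sketch rather than a fix to the exact code above, is to enqueue one sentinel value such as None per worker once all real work has been submitted; the names worker and q are illustrative.)

import threading
from queue import Queue

def worker(q):
    # Keep taking items until the None sentinel arrives, then exit.
    while True:
        item = q.get()
        if item is None:
            q.task_done()
            break
        print('processing', item)
        q.task_done()

q = Queue()
threads = [threading.Thread(target=worker, args=(q,)) for _ in range(4)]
for t in threads:
    t.start()

for item in range(10):
    q.put(item)
for _ in threads:
    q.put(None)          # one sentinel per worker so every thread can exit

q.join()                 # all items (and sentinels) have been processed
for t in threads:
    t.join()             # the threads themselves have now terminated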

I arrived at the solution with James Harrison's help. I don't know why he deleted his original answer, but here it is:

import urllib.request, threading
from queue import Queue
from urllib.parse import urlparse  # needed by gethtml() below
from bs4 import BeautifulSoup
from a3_3 import store_to_db

class a3_5:

    __url_q = Queue(100)
    __html_q = Queue()
    __data_q = Queue()
    __visited_urls=[]

    def gethtml(self,url):
        try:
            req=urllib.request.Request(url)
            html=urllib.request.urlopen(req).read()
            self.__html_q.put(html)
            pars = urlparse(url)  # parsed here but not used further in this version
        except urllib.error.URLError as e:
            print(str(e.reason) + ': ' + url)  # e.reason is not always a str
        except:
            print("invalid: " + url)

    def mine_thread(self):
        while True:
            if not self.__html_q.empty():
                soup = BeautifulSoup(self.__html_q.get(),"html.parser")
                for a in soup.find_all('a', href=True):
                    link=a.get('href')
                    """if not link.startswith('www'):
                        link=self.__prfx+link"""
                    if link not in self.__visited_urls:
                        self.__url_q.put(link)
                        self.__data_q.put(link)
            else:
                break

    def store(self):
        while True:
            if not self.__data_q.empty():
                cont=self.__data_q.get()
                print (cont)
            else:
                break

    def download_thread(self):
        while True:
            if not self.__url_q.empty():
                self.gethtml(self.__url_q.get())
                self.__url_q.task_done()

    def op(self,*urls):
        for x in range(25):
            d = threading.Thread(target=self.download_thread)
            d.daemon = True
            d.start()
        for url in urls:
            self.__url_q.put(url)
        self.__url_q.join()
        self.mine_thread()
        self.store()

if __name__ == '__main__':
    urls=['https://en.wikipedia.org/wiki/Bajirao']#,'https://en.wikipedia.org/wiki/Malharrao_Holkar','https://en.wikipedia.org/wiki/Ranoji_Scindia']
    a=a3_5()
    a.op(*urls)

Essentially I had to arrange the work around a queue serviced by a pool of worker threads. Also, the mine_thread and store methods needed to start only after the download_thread workers had finished with the queue (i.e., after __url_q.join() returned), because otherwise the values wouldn't get stored.
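For reference, the pattern the solution relies on can be reduced to the sketch below (crawl_worker and NUM_WORKERS are illustrative names, not part of the original code): daemon workers block on get(), each processed item is acknowledged with task_done(), and url_q.join() in the main thread returns only once every enqueued URL has been acknowledged, which is why mine_thread and store can safely run after it.

import threading
from queue import Queue

NUM_WORKERS = 4                  # the answer above uses 25
url_q = Queue()

def crawl_worker():
    # Daemon worker: blocks on get(), acknowledges each URL with task_done().
    while True:
        url = url_q.get()
        print('fetching', url)   # stand-in for gethtml(url)
        url_q.task_done()

for _ in range(NUM_WORKERS):
    t = threading.Thread(target=crawl_worker)
    t.daemon = True              # daemon threads don't keep the process alive
    t.start()

for url in ['https://en.wikipedia.org/wiki/Bajirao']:
    url_q.put(url)

url_q.join()                     # unblocks once task_done() was called for every put()
print('all queued URLs fetched; mining and storing can start now')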
