簡體   English   中英

python中的多線程搜尋器

[英]multi threaded crawler in python

我正在嘗試實現一個多路徑抓取工具,該抓取工具將使用初始網址並搜索該鏈接內的鏈接並顯示每個鏈接,同時在每個鏈接內查找鏈接

這是我的代碼

import urllib.request, re, threading, csv
from queue import Queue
from bs4 import BeautifulSoup
from sys import exit

class a3_6:

    __url_q = Queue(100)
    __html_q = Queue()
    __data_q = Queue()
    __visited_urls = []

    def __init__(self, start_url, max_threads):
        self.__url_q.put(start_url)
        self.max_threads = max_threads

    def gethtml(self,url):
        try:
            req=urllib.request.Request(url)
            html=urllib.request.urlopen(req).read()
            self.__html_q.put(html)
        except urllib.error.URLError as e:
            print(e.reason)
        except:
            print("invalid: " + url)
        self.__visited_urls.append(url)

    def mine_thread(self):
        while True:
            if not self.__html_q.empty():
                soup = BeautifulSoup(self.__html_q.get(),"html.parser")
                for a in soup.find_all('a', href=True):
                    if a not in self.__visited_urls:
                        link='https://en.wikipedia.org'+a.get('href')
                        self.__url_q.put(link)
                        self.__data_q.put(link)
            else:
                break

    def store(self):
        while True:
            if not self.__data_q.empty():
                print (self.__data_q.get())

    def download_thread(self):
        while True:
            if not self.__url_q.empty():
                self.gethtml(self.__url_q.get())
            else:
                break

    def run(self):
        self.download_thread()
        self.mine_thread()
        self.store()

    def op(self):
        for x in range(self.max_threads):
            t = threading.Thread(target=self.run)
            t.daemon = True
            t.start()
        self.store()


if __name__ == '__main__':
    a=a3_6('https://en.wikipedia.org/wiki/Main_Page', 5)
    a.op()

編輯:我編輯了代碼,現在我得到了正確的結果,但再次沒有結束。

我到達了解決方案。 我得到了詹姆斯·哈里森的幫助。 我不知道為什么他刪除了原來的解決方案,但這是

import urllib.request, threading
from queue import Queue
from bs4 import BeautifulSoup
from sys import exit
from a3_3 import store_to_db

class a3_5:

    __url_q = Queue(100)
    __html_q = Queue()
    __data_q = Queue()
    __visited_urls=[]

    def gethtml(self,url):
        try:
            req=urllib.request.Request(url)
            html=urllib.request.urlopen(req).read()
            self.__html_q.put(html)
            pars=urlparse(url)
        except urllib.error.URLError as e:
            print(e.reason+':'+url)
        except:
            print("invalid: " + url)

    def mine_thread(self):
        while True:
            if not self.__html_q.empty():
                soup = BeautifulSoup(self.__html_q.get(),"html.parser")
                for a in soup.find_all('a', href=True):
                    link=a.get('href')
                    """if not link.startswith('www'):
                        link=self.__prfx+link"""
                    if link not in self.__visited_urls:
                        self.__url_q.put(link)
                        self.__data_q.put(link)
            else:
                break

    def store(self):
        while True:
            if not self.__data_q.empty():
                cont=self.__data_q.get()
                print (cont)
            else:
                break

    def download_thread(self):
        while True:
            if not self.__url_q.empty():
                self.gethtml(self.__url_q.get())
                self.__url_q.task_done()

    def op(self,*urls):
        for x in range(25):
            d = threading.Thread(target=self.download_thread)
            d.setDaemon(True)
            d.start()
        for url in urls:
            self.__url_q.put(url)
        self.__url_q.join()
        self.mine_thread()
        self.store()

if __name__ == '__main__':
    urls=['https://en.wikipedia.org/wiki/Bajirao']#,'https://en.wikipedia.org/wiki/Malharrao_Holkar','https://en.wikipedia.org/wiki/Ranoji_Scindia']
    a=a3_5()
    a.op(*urls)

本質上,我必須安排另一個隊列,在該隊列中,我必須設置工人來激活線程。 同樣,mine_thread和store方法需要在download_thread方法完成之后啟動,因為不會存儲這些值。

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM