Recursive web crawling in Python

Here is my code:

import requests
from bs4 import BeautifulSoup
import re

class WebCrawler():
    def check(self, links):
        global imgCount
        for item in links:
            targetURL = item['href']
            if(targetURL.startswith('/')):
                targetURL = target + targetURL  # add http:// and hostname to url

            target_html = requests.get(targetURL)
            parsed_html = BeautifulSoup(target_html.text, 'html.parser')
            if parsed_html.title.text not in pages:
                pages.append(parsed_html.title.text)
                print "[+] Collecting images page : " + parsed_html.title.text
                images = parsed_html.findAll('img', {'src': re.compile(r'(jpe?g)|(png)|(svg)$')})
                for img_url in images:
                    imgCount = imgCount + 1
                    # print img_url['src'] + ':::::::::' + img_url.get('alt', "") + "\n"
pages = []
imgCount = 0
target = raw_input("Please enter base url: ")

data = BeautifulSoup(requests.get(target).text, 'html.parser')

link = data.find_all('a')
crawler = WebCrawler()
crawler.check(link)

print "===================== Total Collected Images =====================\n"
print imgCount

I want it to continue on to the other pages, meaning it should keep counting until there are no links left. But when I call the check function recursively, it doesn't work!

import requests
from bs4 import BeautifulSoup
import re

class WebCrawler():
    def check(self, links):
        global imgCount
        for item in links:
            targetURL = item['href']
            if(targetURL.startswith('/')):
                targetURL = target + targetURL  # add http:// and hostname to url

            target_html = requests.get(targetURL)
            parsed_html = BeautifulSoup(target_html.text, 'html.parser')
            if parsed_html.title.text not in pages:
                pages.append(parsed_html.title.text)
                print "[+] Collecting images page : " + parsed_html.title.text
                images = parsed_html.findAll('img', {'src': re.compile(r'(jpe?g)|(png)|(svg)$')})
                for img_url in images:
                    imgCount = imgCount + 1
                    # print img_url['src'] + ':::::::::' + img_url.get('alt', "") + "\n"
            lnks = parsed_html.find_all('a')
            self.check(lnks)


pages = []
imgCount = 0
target = raw_input("Please enter base url: ")

data = BeautifulSoup(requests.get(target).text, 'html.parser')

link = data.find_all('a')
crawler = WebCrawler()
crawler.check(link)

print "===================== Total Collected Images =====================\n"
print imgCount  

I added these lines:

lnks = parsed_html.find_all('a')
self.check(lnks)

This time, the loop executes only once!
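
A likely cause, for what it's worth: the recursive call sits inside the for loop, so the very first `<a>` tag triggers a full descent, and the first href that `requests.get` cannot fetch (a `mailto:`, a bare `#`, or a page with no `<title>`) raises an unhandled exception that unwinds the whole call stack; the outer loop therefore never reaches its second item. Below is a minimal sketch of one way to make the recursion terminate, keeping the question's Python 2 style: deduplicate by URL rather than by page title, resolve relative links with urljoin, guard each request with try/except, and cap the depth. The MAX_DEPTH cap and the crawl function are assumptions introduced here, not part of the original code.

import requests
from bs4 import BeautifulSoup
import re
from urlparse import urljoin  # Python 2; on Python 3 use urllib.parse instead

MAX_DEPTH = 3    # assumed recursion cap, not in the original code
visited = set()  # deduplicate by URL instead of by page title
imgCount = 0
IMG_SRC = re.compile(r'\.(jpe?g|png|svg)$')

def crawl(url, depth=0):
    global imgCount
    if url in visited or depth > MAX_DEPTH:
        return
    visited.add(url)
    try:
        html = requests.get(url, timeout=10).text
    except requests.RequestException:
        return  # one unreachable link must not abort the whole crawl
    parsed = BeautifulSoup(html, 'html.parser')
    if parsed.title is not None:  # some pages have no <title>
        print "[+] Collecting images page : " + parsed.title.text
    imgCount += len(parsed.find_all('img', {'src': IMG_SRC}))
    for a in parsed.find_all('a', href=True):
        link = urljoin(url, a['href'])  # resolve relative hrefs
        if link.startswith('http'):     # skip mailto:, javascript:, #fragments
            crawl(link, depth + 1)

target = raw_input("Please enter base url: ")
crawl(target)
print "Total collected images:", imgCount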

Try something like this:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem

class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/npo"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse_items", follow= True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('//span[@class="pl"]')
        items = []
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.xpath("a/text()").extract()
            item["link"] = title.xpath("a/@href").extract()
            items.append(item)
        return items
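
The CrawlSpider approach sidesteps manual recursion entirely: the Rule/link-extractor pair follows matching links on its own, and Scrapy's built-in duplicate filter plays the role of the hand-rolled pages list. Assuming a standard Scrapy project layout, the spider would be run with:

scrapy crawl craigs

Note that the scrapy.contrib and Sgml*/HtmlXPathSelector import paths come from older Scrapy releases; from Scrapy 1.0 onward the equivalents are scrapy.spiders.CrawlSpider, scrapy.linkextractors.LinkExtractor, and plain response.xpath(...).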

