I hope you are all doing well, both in health and in your R&D work.
import webbrowser
import scrapy
from urllib.request import urlopen
import re
from scrapy.selector import Selector
class QuotesSpider(scrapy.Spider):
    """Download each seed page, follow every absolute URL quoted in it, and save the results.

    The seed pages are fetched through Scrapy's own downloader instead of a
    blocking ``urlopen`` call, so no browser window is opened and no page is
    downloaded twice. Link discovery happens in ``parse_links``; every
    discovered page is written to disk by ``parse``.
    """

    name = "forum"

    def start_requests(self):
        """Yield one request per seed URL, handled by ``parse_links``."""
        # This must be a plain list: a trailing comma after the literal would
        # turn it into a one-element tuple whose only item is the list itself.
        urls = [
            'https://tribune.com.pk/',
            # Other candidate seed sites:
            # 'https://www.siasat.pk/forum/content.php/',
            # 'http://hamariweb.com/news/',
            # 'https://www.urdupoint.com/pakistan/all-news/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_links)

    def parse_links(self, response):
        """Yield a request for every double-quoted http(s)/ftp(s) URL in the page source.

        Args:
            response: the downloaded seed page.
        """
        # ``response.text`` is the decoded body, so a str pattern can be used.
        # The scheme group is non-capturing so that findall returns the URL
        # strings themselves rather than (url, scheme) tuples.
        for link in re.findall(r'"((?:http|ftp)s?://.*?)"', response.text):
            yield scrapy.Request(url=link, callback=self.parse)

    def parse(self, response):
        """Write the response body to a file named after a segment of its URL.

        Args:
            response: a page reached from one of the seed pages.
        """
        parts = response.url.split('/')
        # The second-to-last segment is empty for a URL without a path
        # (e.g. "https://host"); fall back so open() never gets an empty name.
        filename = parts[-2] or parts[-1] or 'index'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
I want to open a webpage that contains many other links, follow all of those links, and have Scrapy scrape every one of the linked pages. Please help me out. Thanks in advance.
I have tried this with monsterindia.com: I opened a page with Scrapy, and that page contains multiple links. I scraped all the data behind each of those links, and pagination can be handled as well. The following code may be useful.
class MonsterSpider(scrapy.Spider):
    """Crawl Monster India search results: follow every job link, then the "Next" page.

    ``parse`` schedules one request per job title link on the current result
    page and, when the pager exposes a "Next" link, one more request for the
    following result page, which is handled by ``parse`` again.
    """

    name = 'monster'
    start_urls = ['http://jobsearch.monsterindia.com/searchresult.html?day=1&jbc=22']
    # NOTE(review): BotItem is not defined or imported in the code shown here;
    # presumably it comes from the project's items module -- confirm.
    item = BotItem()
    count = 1

    def parse(self, response):
        """Yield requests for each job detail page and for the next result page.

        Args:
            response: a search-result page.
        """
        for href in response.css('h2.seotitle > a::attr(href)'):
            url = response.urljoin(href.extract())
            # NOTE(review): parse_details is not defined in this class as
            # shown; it must be provided for these requests to be handled.
            yield scrapy.Request(url=url, callback=self.parse_details)

        pager = response.css('ul.pager')
        # Leading "." keeps the search inside the pager element; a bare "//a"
        # would scan the whole document even though it is chained after .css().
        next_link = './/a[contains(text(), "Next")]'

        next_page_url = pager.xpath(next_link + '/@althref').extract_first()
        self.log('Next page althref: %s' % next_page_url)
        if not next_page_url:
            return

        onclick = pager.xpath(next_link + '/@onclick').extract_first()
        # The result offset is the first single-quoted argument of the onclick
        # handler; stop paginating cleanly if the attribute is missing or has
        # no quoted argument instead of raising.
        pieces = onclick.split("'") if onclick else []
        if len(pieces) < 2:
            self.log('Could not read next page number from onclick: %r' % onclick)
            return

        searchresult_num = pieces[1].strip()
        next_page_url = "http://jobsearch.monsterindia.com/searchresult.html?day=1&n=" + searchresult_num
        next_page_url = response.urljoin(next_page_url)
        self.log('Following next page: %s' % next_page_url)
        yield scrapy.Request(url=next_page_url, callback=self.parse)
The technical posts on this site are licensed under the CC BY-SA 4.0 protocol. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.