简体   繁体   中英

Converting rails task to rake

I currently have this file in my models/ folder:

require 'nokogiri'
require 'open-uri'

# A TV show whose +url+ points at a page listing its next episode.
# +next_screening+ caches the air date/time scraped from that page.
class Show < ActiveRecord::Base
  has_many :user_shows
  has_many :users, through: :user_shows

  # A calendar date such as "3/14/2024" or "03/14/2024".
  DATE_PATTERN = %r{\d{1,2}/\d{1,2}/\d{4}}
  # A bare hour with meridiem such as "8PM" or "10AM". Both digits of a
  # two-digit hour must be captured, otherwise "10PM" is read as "0PM".
  TIME_PATTERN = /\d{1,2}(?:AM|PM)/

  # Re-scrapes every show's page and stores the result in +next_screening+.
  # Pages are fetched one after another; records are loaded in batches.
  def self.update_all_screenings
    find_each do |show|
      show.update_attribute(:next_screening, update_next_screening(show.url))
    end
  end

  # Downloads +url+ and extracts the next screening from the page.
  #
  # @param url [String] address of the show's episode page
  # @return [DateTime, nil] the next screening, or nil when the page does not
  #   contain a recognisable date or time
  # @raise network and HTTP errors from open-uri are not rescued here
  def self.update_next_screening(url)
    # URI#open (from open-uri) works on old and new Rubies alike; Kernel#open
    # stopped accepting URLs in Ruby 3.0.
    page = URI.parse(url).open { |io| Nokogiri::HTML(io) }
    parse_next_screening(page)
  end

  # Reads the next screening out of an already parsed page.
  #
  # If the show airs today the page shows a time instead of a date, so
  # today's date is combined with that time.
  # NOTE(review): the parsed value carries no time zone of its own, so
  # DateTime gives it a zero UTC offset - confirm that this is what the
  # +next_screening+ column expects.
  def self.parse_next_screening(page)
    # The previous implementation returned nil for pages without an <h1>;
    # that behaviour is kept.
    return nil unless page.at_css('h1')

    highlight = page.at_css('.next_episode .highlight_date')
    return nil unless highlight

    text = highlight.text
    if (date = text[DATE_PATTERN])
      DateTime.strptime(date, '%m/%d/%Y')
    elsif (time = text[TIME_PATTERN])
      today = DateTime.now.strftime('%D') # mm/dd/yy
      DateTime.strptime("#{today} #{time}", '%m/%d/%y %l%p')
    end
  rescue ArgumentError
    # strptime rejects impossible values such as "13/45/2024" or "30PM".
    nil
  end
  private_class_method :parse_next_screening
end

However, when I run

Show.update_all_screenings

It takes ages to do. I currently have a very similar script, written as a rake task, that has to do twice the amount of scraping and manages to do it in about 10 minutes, whereas this one will take 8 hours as is. So I was wondering how I would go about converting this file to a rake task? The whole app I'm building depends on this being able to run in at most 1 hour.

Here is the other script for reference:

require 'mechanize'

namespace :show do
  # Rebuilds the shows table from the A-Z index on tv.com: visits every letter
  # page, follows its pagination, and creates one Show per listed link.
  desc "add tv shows from web into database"
  task :scrape => :environment do
    puts 'scraping...'

    # NOTE(review): existing rows are removed before any page is fetched, so a
    # failure part-way through leaves the table only partially repopulated.
    Show.delete_all

    agent = Mechanize.new
    agent.get 'http://www.tv.com/shows/sort/a_z/'

    # NOTE(review): the letter marked "selected" is skipped and the landing
    # page's own listing is never read - confirm that this is intended.
    letter_links = agent.page.search('//div[@class="alphabet"]//li[not(contains(@class, "selected"))]/a')

    letter_links.each do |letter_link|
      agent.get letter_link[:href]

      # Handle the current page, then keep following "next" until it is gone.
      loop do
        agent.page.search('//li[@class="show"]/a').each do |show_link|
          Show.create(title: show_link.text, url: 'http://tv.com' + show_link[:href].to_s + 'episodes/')
        end

        next_page_link = agent.page.at('//div[@class="_pagination"]//a[@class="next"]')
        break unless next_page_link

        agent.get next_page_link[:href]
      end
    end
  end
end

Rake is no magic bullet - it will not run your code any faster.

What you could do is run your code more efficiently. The main time-consumer in your code is iteratively calling open(url). If you could read all the URLs concurrently, the whole process should take a fraction of the time it takes now.

You could use the typhoeus gem (or some other gem) to handle this for you.

--Danger! Untested code ahead!--

I have no experience using this gem, but your code could look something like this:

require 'nokogiri'
require 'open-uri'
require 'typhoeus'

# A TV show whose +url+ points at a page listing its next episode.
# +next_screening+ caches the air date/time scraped from that page.
class Show < ActiveRecord::Base
  has_many :user_shows
  has_many :users, through: :user_shows

  # A calendar date such as "3/14/2024" or "03/14/2024".
  DATE_PATTERN = %r{\d{1,2}/\d{1,2}/\d{4}}
  # A bare hour with meridiem such as "8PM" or "10AM". Both digits of a
  # two-digit hour must be captured, otherwise "10PM" is read as "0PM".
  TIME_PATTERN = /\d{1,2}(?:AM|PM)/

  # Queues one HTTP request per show on a Typhoeus hydra, runs the queue, and
  # stores each show's parsed +next_screening+ as its response arrives.
  def self.update_all_screenings
    hydra = Typhoeus::Hydra.hydra
    find_each do |show|
      request = Typhoeus::Request.new(show.url, followlocation: true)
      request.on_complete do |response|
        # A failed request has no page to parse; leave the stored value alone
        # instead of overwriting it with nil.
        next unless response.success?

        show.update_attribute(:next_screening, update_next_screening(response.body))
      end
      hydra.queue(request)
    end
    hydra.run
  end

  # Extracts the next screening from a downloaded page.
  #
  # If the show airs today the page shows a time instead of a date, so
  # today's date is combined with that time.
  # NOTE(review): the parsed value carries no time zone of its own, so
  # DateTime gives it a zero UTC offset - confirm that this is what the
  # +next_screening+ column expects.
  #
  # @param body [String] HTML source of the show's episode page
  # @return [DateTime, nil] the next screening, or nil when the page does not
  #   contain a recognisable date or time
  def self.update_next_screening(body)
    page = Nokogiri::HTML(body)

    # The previous implementation returned nil for pages without an <h1>;
    # that behaviour is kept.
    return nil unless page.at_css('h1')

    highlight = page.at_css('.next_episode .highlight_date')
    return nil unless highlight

    text = highlight.text
    if (date = text[DATE_PATTERN])
      DateTime.strptime(date, '%m/%d/%Y')
    elsif (time = text[TIME_PATTERN])
      today = DateTime.now.strftime('%D') # mm/dd/yy
      DateTime.strptime("#{today} #{time}", '%m/%d/%y %l%p')
    end
  rescue ArgumentError
    # strptime rejects impossible values such as "13/45/2024" or "30PM".
    nil
  end
end

The above should collect all the requests in one queue, and run them concurrently, acting on any response as it comes.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM