[英]run rake task after csv parse finishes
我有一個看起來像這樣的 rake 任務：
require 'open-uri'
require 'csv'

namespace :tm do
  # Re-imports the event feed: downloads a gzipped CSV, parses every row and
  # creates/updates an Event record, enriching it with deep links matched from
  # three lookup sources (text1 / text2 / text3) and a remote search API.
  #
  # NOTE(review): Kernel#open on a URL and URI.encode are both deprecated and
  # removed in Ruby 3 — migrate to URI.open and ERB::Util.url_encode when the
  # Ruby version allows.
  task reload: :environment do
    gzipped = open('csv link')
    csv_text = Zlib::GzipReader.new(gzipped).read
    csv = CSV.parse(csv_text, headers: true)

    csv.each do |row|
      # Rows whose image column holds the literal placeholder carry no event data.
      next if row[4] == 'logo url'

      tmdate = Date.parse(row[10]).strftime('%Y-%m-%d')
      viatmdate = Date.parse(row[10]).strftime('%d/%m/%Y')
      swtmdate = row[10]
      tmlocation = row[6].split('at ')[1]

      # Venue column is pipe-delimited; segments 1-3 form the display location.
      # BUG FIX: the original '' + place + ', ' + ... concatenation raised
      # TypeError whenever a segment was missing; Array#join tolerates nil.
      place, place1, place2 = row[11].split('|')[1, 3]
      location = [place, place1, place2].join(', ')
      tmtime = row[9]

      text = row[7].gsub('text', '')
      # BUG FIX: eventname and tmname were computed twice with identical code;
      # compute once. '��' is a mojibake'd accented character in the feed —
      # presumably 'é' — TODO confirm against the raw file.
      tmname =
        if text.include?('��')
          text.gsub('��', 'e')
        else
          text.gsub(/[ªÀÈÌÒÙàèìòùÁÉÍÓÚáéíóúÂÊÎÔÛâêîôûÃÑÕãñõÄËÏÖÜŸäëïöüÿ]/, '')
        end
      eventname = tmname

      # Truncated name ("Foo - Bar" -> "Foo") and digits-only name for fallbacks.
      tmnamesplit = text.split(' -')[0] if text.include?(' -')
      tmnamenn = tmname.gsub(/[^0-9]/i, '') if tmname[/[^0-9]/].present?

      text2urldb = text2.where('eventtitle ILIKE ? AND eventdoortime = ? ', "%#{tmname.gsub(/[\-\:\ ]/, '%')}%", tmdate.to_s).first
      text3urldb = text3.where('product_name ILIKE ? AND delivery_time = ? AND valid_from = ?', "%#{tmname}%", tmtime.to_s, tmdate.to_s).first
      text1urldb = text1.where('product_name ILIKE ? AND specifications = ? AND promotional_text = ?', "%#{tmname}%", viatmdate.to_s, "%#{place}%").first

      # Fallback matches on the truncated name.
      if tmnamesplit.present?
        text1urldb ||= text1.where('product_name ILIKE ? AND specifications = ?', "%#{tmnamesplit}%", viatmdate.to_s).first
        text3urldb ||= text3.where('product_name ILIKE ? AND delivery_time = ? AND valid_from = ?', "%#{tmnamesplit}%", tmtime.to_s, tmdate.to_s).first
      end

      # Progressive relaxation of the text1 match (location, then no location,
      # then the "at ..." venue fragment) — same order as the original.
      text1urldb ||= text1.where('product_name ILIKE ? AND specifications = ? AND promotional_text = ?', "%#{tmname}%", viatmdate.to_s, "%#{location}%").first
      text1urldb ||= text1.where('product_name ILIKE ? AND specifications = ?', "%#{tmname}%", viatmdate.to_s).first
      text1urldb ||= text1.where('product_name ILIKE ? AND specifications = ? AND promotional_text = ?', "%#{tmname}%", viatmdate.to_s, "%#{tmlocation}%").first

      vurl = nil
      word1 = nil
      if text1urldb.present?
        vurl = text1urldb.merchant_deep_link
        # Capture the sixth run of 2+ ASCII letters in the deep link; this one
        # pattern is equivalent to the original 12-part generated regex.
        match = /(?:.*?[a-z][a-z]+){5}.*?([a-z][a-z]+)/i.match(vurl)
        word1 = match[1] if match
      end
      gmiurl = text3urldb.merchant_deep_link if text3urldb.present?
      gigurl = text2urldb.eventurl if text2urldb.present?

      api = HTTParty.get(URI.encode('text url' + tmname + '&when_from=' + swtmdate)).parsed_response
      api1 = api['Paging']
      # BUG FIX: the original read api1['TotalResultCount'] *before* checking
      # api1.blank?, raising NoMethodError whenever 'Paging' was missing.
      if api1.present?
        api2 = api1['TotalResultCount']
      else
        # BUG FIX: .to_s guards the digits-only fallback name, which is nil
        # when the name contains no digits (String#+ with nil raised TypeError).
        newapi = HTTParty.get(URI.encode('texturl' + tmnamenn.to_s + '&when_from=' + swtmdate)).parsed_response
        paging = newapi['Paging']
        # BUG FIX: the original tested newapi.blank? after already indexing
        # into it; the meaningful check is whether this response has paging.
        if paging.present?
          api2 = paging['TotalResultCount']
        else
          apisplit = HTTParty.get(URI.encode('texturl' + tmnamesplit.to_s + '&when_from=' + swtmdate)).parsed_response
          pagingsplit = apisplit['Paging']
          api2 = pagingsplit['TotalResultCount'] if pagingsplit.present?
        end
      end

      text1link = vurl   # BUG FIX: originally assigned to `text1`, shadowing
      text3url = gmiurl  # the text1/text3/text2 lookup receivers used above
      text2url = gigurl  # and breaking every subsequent loop iteration.

      swurl = nil
      if api2 != 0
        # NOTE(review): always reads from the first API response even when a
        # fallback request produced the hit — presumably intentional; confirm.
        events = api['Events']
        swurl = events.first['SwURL'] if events.present?
      end

      event_key = { time: row[9], date: row[10], eventname: eventname, eventvenuename: location }
      attrs = { event_type: word1, text: row[8], eventimage: row[4], textlink: swurl, text1link: text1link, text3url: text3url, text2url: text2url }
      event = Event.find_by(event_key)
      if event
        event.update(attrs)
      else
        Event.create(event_key.merge(attrs))
      end
    end
  end
end
現在這要花很長時間才能運行(大約2-3小時),所以我想知道是否將其拆分會加快速度?
如果我把 text1、text2 和 text3 的查詢從這個任務中抽出來，那么它就只是先把基本數據寫入數據庫；等數據完全填充后，再運行另一個 rake 任務來補齊這些鏈接——希望這樣能加快速度。因此，如果我有一個名為 tm:reload 的 rake 任務，又在另一個文件中有一個名為 BA:reload 的任務，我該如何讓后者在前者完成后自動運行？
我已經瀏覽並索引了幾列,但那並沒有太大改善。
我會執行以下操作:
假設：文件中沒有重復的記錄，並且各條記錄可以按任意順序處理（這是把每行拆成獨立后台作業的前提）。
我認為數據庫不是瓶頸。 您發出的HTTP請求更有可能。 在您費力地添加索引之前,我將做一些基准測試以了解花在哪里的時間。
更新:像這樣(我沒有運行代碼)
...
# Enqueue one background job per CSV row instead of processing rows inline.
csv.each { |row| ImportEventJob.perform_later(row) }
ActiveJob是Rails(后台)作業API。 您可以配置一個后端,我喜歡Sidekiq,但是其他也可以。 詳細信息請參見:
# Sketch (not runnable as-is — the bare '...' lines mark code the answer
# author omitted): an ActiveJob worker that imports one CSV row in the
# background, so the rake task only enqueues jobs instead of doing the
# HTTP requests and DB writes itself.
class ImportEventJob < ActiveJob::Base
# Entry point invoked by the job backend with the (serialized) CSV row.
def perform(row)
# Wrap the positional row in a value object with named accessors.
event_csv = EventCsv.new(row)
# Look up an existing Event by its natural key, then update or create.
event = Event.find_by(time: event_csv.time, date: event_csv.date, ...)
if event
...
else
...
end
end
end
# Sketch (the bare '...' line marks omitted accessors): a value object that
# wraps a raw positional CSV row and exposes named readers, extracting the
# parsing logic out of the rake task.
class EventCsv
# Accented characters stripped from the event name when no mojibake is found.
RE_WITH_DIACRITICS = /[ªÀÈÌÒÙàèìòùÁÉÍÓÚáéíóúÂÊÎÔÛâêîôûÃÑÕãñõÄËÏÖÜŸäëïöüÿ]/
def initialize(row)
@row = row
end
# Raw event time string (column 9 of the feed).
def time
@row[9]
end
...
# Event name: drops the literal 'text' marker, then either repairs the feed's
# mojibake ('��' -> 'e' — presumably a garbled 'é'; confirm against the raw
# file) or strips diacritics entirely.
def name
text = @row[7].gsub('text', '')
if text.include? '��'
text.gsub('��', 'e')
else
text.gsub(RE_WITH_DIACRITICS, '')
end
end
end
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.