Twisted/Python - processing a large file line by line
I use the code below to read a file and process it. The file is large, about 12 million lines, so right now I split it into 1,000-line files by hand and start a process for each chunk sequentially (via a bash script).
Is there a way to use Twisted to load the file and process 1,000 items at a time from the single file (a progress bar would be nice), without me splitting it by hand?
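For context, the manual pre-split amounts to something like this (GNU split; the chunk names below are just an example of the naming I use):

split -d -l 1000 --additional-suffix=.txt file.txt chunk
# produces chunk00.txt, chunk01.txt, ... with 1,000 lines each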
scanner.py:
import argparse
import re  # needed for the e-mail regex in cbBody; missing from the original
from tqdm import tqdm
from sys import argv
from pprint import pformat
from twisted.internet.task import react
from twisted.web.client import Agent, readBody
from twisted.web.http_headers import Headers
import lxml.html
from geoip import geolite2
import pycountry
from tld import get_tld
import json
import socket

poweredby = ""
server = ""
ip = ""

def cbRequest(response, url):
    global poweredby, server, ip
    # print 'Response version:', response.version
    # print 'Response code:', response.code
    # print 'Response phrase:', response.phrase
    # print 'Response headers:'
    # print pformat(list(response.headers.getAllRawHeaders()))
    poweredby = response.headers.getRawHeaders("X-Powered-By")[0]
    server = response.headers.getRawHeaders("Server")[0]
    # print poweredby
    # print server
    d = readBody(response)
    d.addCallback(cbBody, url)
    return d

def cbBody(body, ourl):
    global poweredby, server, ip
    # print body
    html_element = lxml.html.fromstring(body)
    generator = html_element.xpath("//meta[@name='generator']/@content")
    ip = socket.gethostbyname(ourl)
    try:
        match = geolite2.lookup(ip)
        if match is not None:
            country = match.country
            try:
                c = pycountry.countries.lookup(country)
                country = c.name
            except:
                country = ""
    except:
        country = ""
    try:
        res = get_tld("http://www" + ourl, as_object=True)
        tld = res.suffix
    except:
        tld = ""
    try:
        match = re.search(r'[\w\.-]+@[\w\.-]+', body)
        email = match.group(0)
    except:
        email = ""
    permalink = ourl.rstrip().replace(".", "-")
    try:
        item = generator[0]
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\",\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
            str(poweredby)) + ",\"MetaGenerator\":" + json.dumps(item) + ",\"Email\":" + json.dumps(
            email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country + "\",\"permalink\":\"" + permalink + "\" }"
    except:
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\"," + "\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
            str(poweredby)) + ",\"MetaGenerator\":\"\",\"Email\":" + json.dumps(
            email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country + "\",\"permalink\":\"" + permalink + "\" }"
    print val

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Scanner v0.99')
    parser.add_argument(
        '-i', '--input', help='Input list of domains', required=True)
    args = parser.parse_args()
    input = args.input
    with open(input) as f:
        urls = f.read().splitlines()

    def mainjob(reactor, urls=urls):
        for url in tqdm(urls):
            agent = Agent(reactor)
            d = agent.request(
                'GET', "http://" + url,
                Headers({'User-Agent': ['bot']}),
                None)
            d.addCallback(cbRequest, url)
            d.addErrback(lambda x: None)  # ignore errors
        return d

    react(mainjob, argv[3:])
Update 1:
Right now I run it like this:
file.txt - 12,000,000 lines
chunk01.txt - a file with 1,000 lines, and so on.
I run the script once per chunk file:
python scanner.py chunk01.txt
python scanner.py chunk02.txt
.
.
.
What I want is to run the script once:
python scanner.py file.txt
The problem is that I need to pass the URLs to react() as an argument. If I read all 12,000,000 lines into memory (via f.read()), it is too large. That is why I split the file and run the script on each small file.
Hopefully it is clearer now...
Update 2:
Based on @Jean-Paul Calderone's answer, I wrote the code below.
It seems to work, but something has puzzled me ever since:
180,000 iterations... I assumed 180,000 domains (one per line of the input file), yet the script only printed/output ca. 35,707 lines (entries). I would expect it to be close to 180,000... I know some domains time out. When I ran it the "old" way it was more consistent: the number of output lines was close to the number of input domains.
Is there something "bad" in the code? Any ideas?
python scanner.py > out.txt
181668it [1:47:36, 4.82it/s]
And counting the lines:
wc -l out.txt
36840 out.txt
scanner.py:
import argparse
import re  # needed for the e-mail regex in cbBody; missing from the original
from tqdm import tqdm
from sys import argv
from pprint import pformat
from twisted.internet.task import react
from twisted.web.client import Agent, readBody
from twisted.web.http_headers import Headers
from twisted.internet.task import cooperate
from twisted.internet.defer import gatherResults
import lxml.html
from geoip import geolite2
import pycountry
from tld import get_tld
import json
import socket

poweredby = ""
server = ""
ip = ""

def cbRequest(response, url):
    global poweredby, server, ip
    # print 'Response version:', response.version
    # print 'Response code:', response.code
    # print 'Response phrase:', response.phrase
    # print 'Response headers:'
    # print pformat(list(response.headers.getAllRawHeaders()))
    poweredby = response.headers.getRawHeaders("X-Powered-By")[0]
    server = response.headers.getRawHeaders("Server")[0]
    # print poweredby
    # print server
    d = readBody(response)
    d.addCallback(cbBody, url)
    return d

def cbBody(body, ourl):
    global poweredby, server, ip
    # print body
    html_element = lxml.html.fromstring(body)
    generator = html_element.xpath("//meta[@name='generator']/@content")
    ip = socket.gethostbyname(ourl)
    try:
        match = geolite2.lookup(ip)
        if match is not None:
            country = match.country
            try:
                c = pycountry.countries.lookup(country)
                country = c.name
            except:
                country = ""
    except:
        country = ""
    try:
        res = get_tld("http://www" + ourl, as_object=True)
        tld = res.suffix
    except:
        tld = ""
    try:
        match = re.search(r'[\w\.-]+@[\w\.-]+', body)
        email = match.group(0)
    except:
        email = ""
    permalink = ourl.rstrip().replace(".", "-")
    try:
        item = generator[0]
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\",\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
            str(poweredby)) + ",\"MetaGenerator\":" + json.dumps(item) + ",\"Email\":" + json.dumps(
            email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country + "\",\"permalink\":\"" + permalink + "\" }"
    except:
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\"," + "\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
            str(poweredby)) + ",\"MetaGenerator\":\"\",\"Email\":" + json.dumps(
            email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country + "\",\"permalink\":\"" + permalink + "\" }"
    print val

def main(reactor, url_path):
    urls = open(url_path)
    return mainjob(reactor, (url.strip() for url in urls))

def mainjob(reactor, urls=argv[2:]):
    # for url in urls:
    #     print url
    agent = Agent(reactor)
    work = (process(agent, url) for url in tqdm(urls))
    tasks = list(cooperate(work) for i in range(100))
    return gatherResults(list(task.whenDone() for task in tasks))

def process(agent, url):
    d = agent.request(
        'GET', "http://" + url,
        Headers({'User-Agent': ['bot']}),
        None)
    d.addCallback(cbRequest, url)
    d.addErrback(lambda x: None)  # ignore errors
    return d

react(main, ["./domains.txt"])
Update 3:
Updated the code to print errors to errors.txt:
import argparse
import re  # needed for the e-mail regex in cbBody; missing from the original
from tqdm import tqdm
from sys import argv
from pprint import pformat
from twisted.internet.task import react
from twisted.web.client import Agent, readBody
from twisted.web.http_headers import Headers
from twisted.internet.task import cooperate
from twisted.internet.defer import gatherResults
import lxml.html
from geoip import geolite2
import pycountry
from tld import get_tld
import json
import socket

poweredby = ""
server = ""
ip = ""

f = open("errors.txt", "w")

def error(response, url):
    # errback: record the failing URL (`response` here is a Failure)
    f.write("Error: " + url + "\n")

def cbRequest(response, url):
    global poweredby, server, ip
    # print 'Response version:', response.version
    # print 'Response code:', response.code
    # print 'Response phrase:', response.phrase
    # print 'Response headers:'
    # print pformat(list(response.headers.getAllRawHeaders()))
    poweredby = response.headers.getRawHeaders("X-Powered-By")[0]
    server = response.headers.getRawHeaders("Server")[0]
    # print poweredby
    # print server
    d = readBody(response)
    d.addCallback(cbBody, url)
    return d

def cbBody(body, ourl):
    global poweredby, server, ip
    # print body
    html_element = lxml.html.fromstring(body)
    generator = html_element.xpath("//meta[@name='generator']/@content")
    ip = socket.gethostbyname(ourl)
    try:
        match = geolite2.lookup(ip)
        if match is not None:
            country = match.country
            try:
                c = pycountry.countries.lookup(country)
                country = c.name
            except:
                country = ""
    except:
        country = ""
    try:
        res = get_tld("http://www" + ourl, as_object=True)
        tld = res.suffix
    except:
        tld = ""
    try:
        match = re.search(r'[\w\.-]+@[\w\.-]+', body)
        email = match.group(0)
    except:
        email = ""
    permalink = ourl.rstrip().replace(".", "-")
    try:
        item = generator[0]
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\",\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
            str(poweredby)) + ",\"MetaGenerator\":" + json.dumps(item) + ",\"Email\":" + json.dumps(
            email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country + "\",\"permalink\":\"" + permalink + "\" }"
    except:
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\"," + "\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
            str(poweredby)) + ",\"MetaGenerator\":\"\",\"Email\":" + json.dumps(
            email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country + "\",\"permalink\":\"" + permalink + "\" }"
    print val

def main(reactor, url_path):
    urls = open(url_path)
    return mainjob(reactor, (url.strip() for url in urls))

def mainjob(reactor, urls=argv[2:]):
    # for url in urls:
    #     print url
    agent = Agent(reactor)
    work = (process(agent, url) for url in tqdm(urls))
    tasks = list(cooperate(work) for i in range(100))
    return gatherResults(list(task.whenDone() for task in tasks))

def process(agent, url):
    d = agent.request(
        'GET', "http://" + url,
        Headers({'User-Agent': ['crawler']}),
        None)
    d.addCallback(cbRequest, url)
    d.addErrback(error, url)
    return d

react(main, ["./domains.txt"])
f.close()
Update 4:
I captured the traffic with Wireshark, using only the 2 domains that had errored before:
user@laptop:~/crawler$ python scanner.py
2it [00:00, 840.71it/s]
user@laptop:~/crawler$ cat errors.txt
Error: google.al
Error: fau.edu.al
As you can see, they errored out, yet in Wireshark I can see the responses:
Answer (by Jean-Paul Calderone):
You need to add a limit to the amount of concurrency your program creates. Currently, you process all of the given URLs at the same time, or try to, at least:
def mainjob(reactor, urls=urls):
    for url in tqdm(urls):
        agent = Agent(reactor)
        d = agent.request(
            'GET', "http://" + url,
            Headers({'User-Agent': ['bot']}),
            None)
        d.addCallback(cbRequest, url)
        d.addErrback(lambda x: None)  # ignore errors
    return d
This issues a request for every URL without waiting for any of them to complete. Instead, use twisted.internet.task.cooperate to run a limited number at a time. This runs one request at a time:
def mainjob(reactor, urls):
    agent = Agent(reactor)
    work = (process(agent, url) for url in tqdm(urls))
    task = cooperate(work)
    return task.whenDone()

def process(agent, url):
    d = agent.request(
        'GET', "http://" + url,
        Headers({'User-Agent': ['bot']}),
        None)
    d.addCallback(cbRequest, url)
    d.addErrback(lambda x: None)  # ignore errors
    return d
You probably want more than that, though. So, call cooperate() a few more times:
def mainjob(reactor, urls=urls):
    agent = Agent(reactor)
    work = (process(agent, url) for url in tqdm(urls))
    tasks = list(cooperate(work) for i in range(100))
    return gatherResults(list(task.whenDone() for task in tasks))
This runs up to 100 requests at a time. Each task pulls the next element from work and waits for it. gatherResults waits for all 100 tasks to finish.
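If you want to tune that limit, the number of tasks can be made a parameter. A small sketch of the same idea (the concurrency argument is my naming, reusing process() and the imports from above):

def mainjob(reactor, urls, concurrency=100):
    agent = Agent(reactor)
    work = (process(agent, url) for url in tqdm(urls))
    # every task consumes from the same generator, so at most
    # `concurrency` requests are outstanding at any moment
    tasks = [cooperate(work) for _ in range(concurrency)]
    return gatherResults([task.whenDone() for task in tasks])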
Now, just avoid loading the whole input into memory at once:
def main(reactor, url_path):
    urls = open(url_path)
    return mainjob(reactor, (url.strip() for url in urls))

react(main, ["path-to-urls.txt"])
This opens the URL file but reads lines from it only as they are needed.
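Since the question also asked for a progress bar: tqdm shows only a bare counter for a plain generator, but it can draw a real bar if it is given the total up front. A minimal sketch, assuming one extra pass over the file to count lines is acceptable (count_lines is my own helper, not part of the answer above; drop the tqdm() call inside mainjob so there is only one bar):

def count_lines(path):
    # cheap first pass over the file, just to learn the total for tqdm
    with open(path) as f:
        return sum(1 for _ in f)

def main(reactor, url_path):
    total = count_lines(url_path)
    urls = open(url_path)
    # wrapping the lazy generator with an explicit total gives a real bar
    return mainjob(reactor, tqdm((url.strip() for url in urls), total=total))

react(main, ["path-to-urls.txt"])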