Twisted/Python - processing a large file line by line

I use this code to read a file and process it. The file is big, 12,000,000 lines, so currently I split it manually into 1,000-line files and launch each process sequentially for each 1,000 lines (bash script).

Is there a way to use Twisted to load the file and process 1,000 items at a time from a single file (a progress bar would be nice), without me having to split it manually?

scanner.py:

import argparse

from tqdm import tqdm
from sys import argv
from pprint import pformat

from twisted.internet.task import react
from twisted.web.client import Agent, readBody
from twisted.web.http_headers import Headers

import lxml.html

from geoip import geolite2
import pycountry

from tld import get_tld
import json
import socket
import re  # needed for the email regex in cbBody

poweredby = ""
server = ""
ip = ""


def cbRequest(response, url):
    global poweredby, server, ip
    # print 'Response version:', response.version
    # print 'Response code:', response.code
    # print 'Response phrase:', response.phrase
    # print 'Response headers:'
    # print pformat(list(response.headers.getAllRawHeaders()))
    poweredby = response.headers.getRawHeaders("X-Powered-By")[0]
    server = response.headers.getRawHeaders("Server")[0]

    #print poweredby
    #print server

    d = readBody(response)
    d.addCallback(cbBody, url)
    return d


def cbBody(body, ourl):
    global poweredby, server,ip

    #print body
    html_element = lxml.html.fromstring(body)
    generator = html_element.xpath("//meta[@name='generator']/@content")

    ip = socket.gethostbyname(ourl)

    try:
        match = geolite2.lookup(ip)
        if match is not None:
            country = match.country
            try:

                c = pycountry.countries.lookup(country)
                country = c.name
            except:
                country = ""

    except:
        country = ""
    try:
        res = get_tld("http://www" + ourl, as_object=True)
        tld = res.suffix
    except:
        tld = ""

    try:
        match = re.search(r'[\w\.-]+@[\w\.-]+', body)
        email = match.group(0)
    except:
        email = ""

    permalink=ourl.rstrip().replace(".","-")

    try:
        item = generator[0]
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\",\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
                str(poweredby)) + ",\"MetaGenerator\":" + json.dumps(item) + ",\"Email\":" + json.dumps(
                    email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country+"\",\"permalink\":\""+permalink+"\" }"
    except:
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\"," + "\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
                str(poweredby)) + ",\"MetaGenerator\":\"\",\"Email\":" + json.dumps(
                    email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country+"\",\"permalink\":\""+permalink+"\" }"


    print val

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Scanner v0.99')
    parser.add_argument(
        '-i', '--input', help='Input list of domains', required=True)
    args = parser.parse_args()
    input = args.input

with open(input) as f:
    urls = f.read().splitlines()


def mainjob(reactor, urls=urls):
    for url in tqdm(urls):
        agent = Agent(reactor)
        d = agent.request(
            'GET', "http://" + url,
            Headers({'User-Agent': ['bot']}),
            None)
        d.addCallback(cbRequest, url)
        d.addErrback(lambda x: None)  # ignore errors
    return d


react(mainjob, argv[3:])

Update 1:

Right now, I execute it like this:

file.txt - 12,000,000 lines

chunk01.txt - a file with 1,000 lines.

I run the script once for each chunk file:

python scanner.py chunk01.txt
python scanner.py chunk02.txt
.
.
.

I would like to execute the script just once:

python scanner.py file.txt

The problem is that I need to pass the URLs to react() as a parameter. If I read all 12,000,000 lines into memory (via f.read()), it is too big. That is why I split the file and run the script on each small piece.

Hope it is clearer now...

Update 2:

Based on @Jean-Paul Calderone's answer below, I wrote this code.

It seems to work, but something has puzzled me ever since:

Over 180,000 iterations... I assume 180,000 domains (one per line in the input file), yet the script only printed about 35,707 lines (entries). I expected it to be close to 180,000... I know some domains time out. When I ran it the "old" way, it was more consistent: the number of output lines was much closer to the number of input domains.

Is there something "bad" in the code? Any ideas?

python scanner.py > out.txt

181668it [1:47:36,  4.82it/s]

And counting the lines:

wc -l out.txt
36840 out.txt

scanner.py:

import argparse

from tqdm import tqdm
from sys import argv
from pprint import pformat

from twisted.internet.task import react
from twisted.web.client import Agent, readBody
from twisted.web.http_headers import Headers
from twisted.internet.task import cooperate
from twisted.internet.defer import gatherResults

import lxml.html

from geoip import geolite2
import pycountry

from tld import get_tld
import json
import socket
import re  # needed for the email regex in cbBody

poweredby = ""
server = ""
ip = ""


def cbRequest(response, url):
    global poweredby, server, ip
    # print 'Response version:', response.version
    # print 'Response code:', response.code
    # print 'Response phrase:', response.phrase
    # print 'Response headers:'
    # print pformat(list(response.headers.getAllRawHeaders()))
    poweredby = response.headers.getRawHeaders("X-Powered-By")[0]
    server = response.headers.getRawHeaders("Server")[0]

    #print poweredby
    #print server

    d = readBody(response)
    d.addCallback(cbBody, url)
    return d


def cbBody(body, ourl):
    global poweredby, server,ip

    #print body
    html_element = lxml.html.fromstring(body)
    generator = html_element.xpath("//meta[@name='generator']/@content")

    ip = socket.gethostbyname(ourl)

    try:
        match = geolite2.lookup(ip)
        if match is not None:
            country = match.country
            try:

                c = pycountry.countries.lookup(country)
                country = c.name
            except:
                country = ""

    except:
        country = ""
    try:
        res = get_tld("http://www" + ourl, as_object=True)
        tld = res.suffix
    except:
        tld = ""

    try:
        match = re.search(r'[\w\.-]+@[\w\.-]+', body)
        email = match.group(0)
    except:
        email = ""

    permalink=ourl.rstrip().replace(".","-")

    try:
        item = generator[0]
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\",\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
                str(poweredby)) + ",\"MetaGenerator\":" + json.dumps(item) + ",\"Email\":" + json.dumps(
                    email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country+"\",\"permalink\":\""+permalink+"\" }"
    except:
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\"," + "\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
                str(poweredby)) + ",\"MetaGenerator\":\"\",\"Email\":" + json.dumps(
                    email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country+"\",\"permalink\":\""+permalink+"\" }"


    print val


def main(reactor, url_path):
    urls = open(url_path)
    return mainjob(reactor, (url.strip() for url in urls))

def mainjob(reactor, urls=argv[2:]):
    #for url in urls:
    #  print url
    agent = Agent(reactor)
    work = (process(agent, url) for url in tqdm(urls))
    tasks = list(cooperate(work) for i in range(100))
    return gatherResults(list(task.whenDone() for task in tasks))



def process(agent, url):
    d = agent.request(
        'GET', "http://" + url,
        Headers({'User-Agent': ['bot']}),
        None)
    d.addCallback(cbRequest, url)
    d.addErrback(lambda x: None)  # ignore errors
    return d

react(main, ["./domains.txt"])
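
One likely place the missing entries disappear is the errback, which silently swallows every failure (timeouts, DNS errors, parse errors), so nothing is printed for those domains. Below is a minimal sketch of recording the reason instead, using a hypothetical log_failure helper (Update 3 below does something similar by writing to errors.txt):

def log_failure(failure, url):
    # failure is a twisted.python.failure.Failure; note why the URL was skipped
    with open("failures.log", "a") as log:
        log.write("%s\t%s\n" % (url, failure.getErrorMessage()))

# then, in process(), replace the silent errback:
# d.addErrback(log_failure, url)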

Update 3:

Updated the code to print errors to errors.txt:

import argparse

from tqdm import tqdm
from sys import argv
from pprint import pformat

from twisted.internet.task import react
from twisted.web.client import Agent, readBody
from twisted.web.http_headers import Headers
from twisted.internet.task import cooperate
from twisted.internet.defer import gatherResults

import lxml.html

from geoip import geolite2
import pycountry

from tld import get_tld
import json
import socket
import re  # needed for the email regex in cbBody

poweredby = ""
server = ""
ip = ""

f = open("errors.txt", "w")


def error(response, url):
    f.write("Error: "+url+"\n") 


def cbRequest(response, url):
    global poweredby, server, ip
    # print 'Response version:', response.version
    # print 'Response code:', response.code
    # print 'Response phrase:', response.phrase
    # print 'Response headers:'
    # print pformat(list(response.headers.getAllRawHeaders()))
    poweredby = response.headers.getRawHeaders("X-Powered-By")[0]
    server = response.headers.getRawHeaders("Server")[0]

    #print poweredby
    #print server

    d = readBody(response)
    d.addCallback(cbBody, url)
    return d


def cbBody(body, ourl):
    global poweredby, server,ip

    #print body
    html_element = lxml.html.fromstring(body)
    generator = html_element.xpath("//meta[@name='generator']/@content")

    ip = socket.gethostbyname(ourl)

    try:
        match = geolite2.lookup(ip)
        if match is not None:
            country = match.country
            try:

                c = pycountry.countries.lookup(country)
                country = c.name
            except:
                country = ""

    except:
        country = ""
    try:
        res = get_tld("http://www" + ourl, as_object=True)
        tld = res.suffix
    except:
        tld = ""

    try:
        match = re.search(r'[\w\.-]+@[\w\.-]+', body)
        email = match.group(0)
    except:
        email = ""

    permalink=ourl.rstrip().replace(".","-")

    try:
        item = generator[0]
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\",\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
                str(poweredby)) + ",\"MetaGenerator\":" + json.dumps(item) + ",\"Email\":" + json.dumps(
                    email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country+"\",\"permalink\":\""+permalink+"\" }"
    except:
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\"," + "\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
                str(poweredby)) + ",\"MetaGenerator\":\"\",\"Email\":" + json.dumps(
                    email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country+"\",\"permalink\":\""+permalink+"\" }"


    print val


def main(reactor, url_path):
    urls = open(url_path)
    return mainjob(reactor, (url.strip() for url in urls))

def mainjob(reactor, urls=argv[2:]):
    #for url in urls:
    #  print url
    agent = Agent(reactor)
    work = (process(agent, url) for url in tqdm(urls))
    tasks = list(cooperate(work) for i in range(100))
    return gatherResults(list(task.whenDone() for task in tasks))



def process(agent, url):
    d = agent.request(
        'GET', "http://" + url,
        Headers({'User-Agent': ['crawler']}),
        None)
    d.addCallback(cbRequest, url)
    d.addErrback(error, url) 
    return d

react(main, ["./domains.txt"])

f.close()

Update 4:

I captured the traffic with Wireshark, using just 2 domains that errored before:

user@laptop:~/crawler$ python scanner.py 
2it [00:00, 840.71it/s]
user@laptop:~/crawler$ cat errors.txt 
Error: google.al
Error: fau.edu.al

As you can see, they produced errors, yet in Wireshark I can see the responses:

[Wireshark screenshot showing the HTTP responses]
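
One possible explanation for these (an assumption on my part, not confirmed above): cbRequest indexes getRawHeaders(...)[0] directly, so a response that lacks an X-Powered-By or Server header raises a TypeError, and the errback then logs the domain as an error even though a response did arrive. A minimal sketch of reading those headers defensively:

# Headers.getRawHeaders() returns None when the header is absent, so guard
# before indexing; "unknown" is just an arbitrary placeholder value.
poweredby = (response.headers.getRawHeaders("X-Powered-By") or ["unknown"])[0]
server = (response.headers.getRawHeaders("Server") or ["unknown"])[0]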

You need to add a limit to the amount of concurrency your program creates. Currently, you process all of the given URLs at the same time - or try to, at least:

def mainjob(reactor, urls=urls):
    for url in tqdm(urls):
        agent = Agent(reactor)
        d = agent.request(
            'GET', "http://" + url,
            Headers({'User-Agent': ['bot']}),
            None)
        d.addCallback(cbRequest, url)
        d.addErrback(lambda x: None)  # ignore errors
    return d

This issues a request for every URL without waiting for any of them to complete. Instead, use twisted.internet.task.cooperate to run a limited number at a time. This runs one request at a time:

def mainjob(reactor, urls):
    agent = Agent(reactor)
    work = (process(agent, url) for url in tqdm(urls))
    task = cooperate(work)
    return task.whenDone()

def process(agent, url):
    d = agent.request(
        'GET', "http://" + url,
        Headers({'User-Agent': ['bot']}),
        None)
    d.addCallback(cbRequest, url)
    d.addErrback(lambda x: None)  # ignore errors
    return d

You probably want more than that, so call cooperate() a number of additional times:

def mainjob(reactor, urls=urls):
    agent = Agent(reactor)
    work = (process(agent, url) for url in tqdm(urls))
    tasks = list(cooperate(work) for i in range(100))
    return gatherResults(list(task.whenDone() for task in tasks))

This runs up to 100 requests at a time. Each task pulls the next element from work and waits for it. gatherResults waits for all 100 tasks to finish.
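
As a self-contained illustration of that sharing behaviour (a toy sketch, separate from the scanner, with deferLater standing in for a real HTTP request): three cooperate() tasks consume a single generator, so at most three simulated requests are in flight at once.

from sys import stdout

from twisted.internet import task
from twisted.internet.defer import gatherResults

def demo(reactor):
    def work():
        for i in range(10):
            # deferLater stands in for a request that takes some time;
            # each cooperate() task pauses until the yielded Deferred fires.
            yield task.deferLater(reactor, 0.1, stdout.write, "finished item %d\n" % i)
    items = work()
    workers = [task.cooperate(items) for _ in range(3)]  # at most 3 items in flight
    return gatherResults([w.whenDone() for w in workers])

task.react(demo)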

Finally, avoid loading the complete input into memory at once:

def main(reactor, url_path):
    urls = open(url_path)
    return mainjob(reactor, (url.strip() for url in urls))

react(main, ["path-to-urls.txt"])

This opens the URL file but only reads lines from it as they are needed.
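
If you also want tqdm to show a meaningful percentage instead of a bare counter, one option (my own addition, not something the answer requires) is to count the lines in a cheap first pass and hand that count to tqdm as total:

def main(reactor, url_path):
    # First pass only counts lines; the file is still read lazily below.
    with open(url_path) as counted:
        total = sum(1 for _ in counted)
    urls = open(url_path)
    return mainjob(reactor, (url.strip() for url in urls), total)

def mainjob(reactor, urls, total=None):
    agent = Agent(reactor)
    # total lets tqdm display a percentage and ETA rather than a raw count
    work = (process(agent, url) for url in tqdm(urls, total=total))
    tasks = list(cooperate(work) for i in range(100))
    return gatherResults(list(task.whenDone() for task in tasks))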
