
How to use threading to speed up my function?

I'm working on my tool.

So I have this function:

import subprocess, os, platform, ctypes, requests, random, threading
from bs4 import BeautifulSoup as bs
from requests.utils import requote_uri  # used below to URL-encode the dorks
from urllib.parse import unquote        # used below to decode scraped links

temptotal = 0
totalurl = 0 
retry = 0
load = 0
load2 = 0
loaded = 0
dorksdone = 0
tempourl = 0

#Import Proxy List
selecting = 1
while selecting == 1:
    try:
        option = int(input("Choose proxy type (1 = http, 2 = socks4, 3 = socks5): "))
    except:
        option = 404
 
    if option == 1:
        selecting = 0
        prox = 'http'
        proxyyyy = 'http'
    elif option == 2:
        selecting = 0
        prox = 'socks4'
        proxyyyy = 'socks4'
    elif option == 3:
        selecting = 0
        prox = 'socks5'
        proxyyyy = 'socks5'
    else:
        print("Choose valid numbre such as 1, 2 or 3!") 
proxy_list = input("Give me Proxylist: ")
with open(proxy_list, mode="r", encoding="utf-8") as mf:
    for line in mf:
        load2 += 1
print(" ")
print("Total Proxy loaded: " + str(load2))
print(" ")

#import keywordfile
dorkslist = input("Give me KeywordList/Dorklist: ")
with open(dorkslist, mode="r", encoding="utf-8") as mf:
    for line in mf:
        load += 1
    
print(" ")
print("Total Dorks loaded:" + str(load)) 
print(" ")

#define URLs to check
yahoourl = {"https://fr.search.yahoo.com/search?p=&fr=yfp-search-sb",
"https://fr.search.yahoo.com/search?p=&fr=yfp-search-sb&b=11&pz=10"}

#function I want to speed up
def checker():
    global temptotal
    global loaded
    global dorksdone
    global tempourl
    proxy = set()    
    with open(proxy_list, "r") as f:
        file_lines1 = f.readlines()
        for line1 in file_lines1:
            proxy.add(line1.strip())    
    with open(dorkslist, mode="r",encoding="utf-8") as my_file:
        for line in my_file:
            loaded += 1
            threading.Thread(target=titre).start()  # titre() is defined elsewhere in my tool
            indorks = line
            encode = requote_uri(indorks)
            for yahoo in yahoourl:
                yahooo = yahoo.replace("&fr",encode + "&fr")
                try:
                    proxies = {
                    'http': prox+'://'+random.choice(list(proxy))
                    }    
                    r = requests.get(yahooo, proxies=proxies)
                    print("Dorks used :" + indorks )
                    dorksdone += 1
                    soup = bs(r.text, 'html.parser')
                    for link in soup.find_all('a'):
                        a = link.get('href')
                        a = unquote(a)  # decode percent-escapes before saving
                        temptotal += 1
                        with open("Bing.txt", mode="a",encoding="utf-8") as fullz:
                            fullz.write(a + "\n")
                            fullz.close()
                        lines_seen = set() # holds lines already seen
                        outfile = open("Bingnodup.txt", "w", encoding="utf-8")
                        for line in open("Bing.txt", "r", encoding="utf-8"):
                            if line not in lines_seen: # not a duplicate                 
                                outfile.write(line)
                                lines_seen.add(line)
                        outfile.close()
                        with open("Bingnodup.txt", mode="r", encoding="utf-8") as cool:
                            for url in cool:            
                                try:
                                    proxies = {
                                    'http': prox+'://'+random.choice(list(proxy))
                                    } 
                                    response = requests.get(url, proxies=proxies)                
                                    save = response.url
                                    with open("Bingtemp.txt", mode="a", encoding="utf-8") as cool1:                    
                                        cool1.write(save + "\n")
                                        tempourl += 1
                                    cool1.close()
                                except:
                                    pass
                except:
                    raise
    fin()  # fin() is defined elsewhere in my tool
           

#start bot

bot1 = threading.Thread(target=checker)

bot1.start()

bot1.join()

Example file for keywords:

python
wordpress

Example file for proxies (HTTP, so choose 1):

46.4.96.137:8080
223.71.167.169:80
219.248.205.117:3128
198.24.171.34:8001
51.158.123.35:9999

But this function is very, very slow when running. Could someone let me know how I can speed it up? I have tried to use this topic: How can I use threading in Python?

But I didn't understand how to build it into my function the right way.

Your script is what's called I/O bound. This means it is not slow because the CPU needs to perform long computations, but because it has to wait a lot every time it requests a URL (the bottleneck is the network requests).

For concurrency you have 3 options: asyncio, threading, and multiprocessing.

The first two are the ones which can help you with I/O bound problems like yours. The first one is the recommended approach in a problem like this, since there is a library (aiohttp) available with support for async/await.
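
Since your question is specifically about threading, here is a minimal sketch of the second option using concurrent.futures; the fetch_url helper, the worker count, and the sample URLs are illustrative assumptions, not part of your original code:

from concurrent.futures import ThreadPoolExecutor

import requests

def fetch_url(url):
    # Each call runs in a worker thread; while one thread waits on the
    # network, the others keep going, which is where the speedup comes from.
    try:
        return requests.get(url, timeout=10).url
    except requests.RequestException:
        return None

urls = ["https://example.com", "https://example.org"]  # illustrative list
with ThreadPoolExecutor(max_workers=20) as pool:
    for final_url in pool.map(fetch_url, urls):
        if final_url:
            print(final_url)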

This is an adapted example from the above link, which does exactly what you need:

import asyncio
import os
import platform
import random

import aiohttp

prox = "http"  # proxy scheme chosen earlier in your script


def get_proxies():
    # Clear the terminal, then load one proxy per line from proxy.txt
    if platform.system() == "Linux":
        os.system('clear')
    if platform.system() == "Windows":
        os.system('cls')
    proxy = set()
    with open("proxy.txt", "r") as f:
        for line1 in f:
            proxy.add(line1.strip())
    return proxy


async def download_site(session, url, proxy_url):
    # Note: aiohttp takes a single proxy URL string via the `proxy` argument,
    # not a `proxies` dict like requests does
    async with session.get(url, proxy=proxy_url) as response:
        save = str(response.url)
        with open("Yahootemp.txt", mode="a", encoding="utf-8") as cool1:
            cool1.write(save + "\n")


async def download_all_sites(sites, proxy_url):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for url in sites:
            task = asyncio.ensure_future(download_site(session, url, proxy_url))
            tasks.append(task)
        await asyncio.gather(*tasks, return_exceptions=True)


if __name__ == "__main__":
    proxies = get_proxies()
    proxy_url = prox + '://' + random.choice(list(proxies))
    sites = []
    with open("Yahoonodup.txt", mode="r", encoding="utf-8") as cool:
        for url in cool:
            sites.append(url.strip())
    asyncio.run(download_all_sites(sites, proxy_url))
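
Note that aiohttp is a third-party package (pip install aiohttp) and only supports plain HTTP proxies out of the box; for SOCKS proxies you would need an extra connector such as the aiohttp-socks package.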
    

You could make it even faster if saving the files still seems to be too slow; read this.
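
One way to do that, sketched here as an assumption rather than as what the linked post describes, is to have each task return its URL and write the file once after asyncio.gather, instead of reopening it inside every task:

import asyncio

import aiohttp

async def download_site(session, url, proxy_url):
    async with session.get(url, proxy=proxy_url) as response:
        return str(response.url)

async def download_all_sites(sites, proxy_url):
    async with aiohttp.ClientSession() as session:
        tasks = [download_site(session, url, proxy_url) for url in sites]
        results = await asyncio.gather(*tasks, return_exceptions=True)
    # One open/write instead of one per URL
    with open("Yahootemp.txt", mode="a", encoding="utf-8") as cool1:
        for res in results:
            if isinstance(res, str):  # skip exceptions returned by gather
                cool1.write(res + "\n")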
