
Python Requests Stream Via Tor - Connection Dies

I'm using the Python requests library to download the file at 'onionurl' in a multiprocessed fashion, in order to download a number of files from a Tor hidden service.

That is the reasoning behind the code.

However, as these files download, each one cuts out after a minute or two. The stream simply fails partway through: no error is raised, but 'closing text file' is printed. This makes it impossible to download the files hosted on these onion servers, which are several hundred gigabytes each.

Any help resolving this problem would be greatly appreciated.

    # onionurl, foldername and dataloc are set by the surrounding
    # (multiprocessed) download function
    import os
    import requests

    session = requests.session()
    # route everything through the local Tor SOCKS proxy; socks5h resolves
    # hostnames through the proxy, which .onion addresses require
    session.proxies = {}
    session.proxies['http'] = 'socks5h://localhost:9050'
    session.proxies['https'] = 'socks5h://localhost:9050'

    url = onionurl

    try:
        if not os.path.isdir(foldername):
            os.makedirs(foldername)
        # download the body of the response in chunks, not all at once
        with session.get(url, stream=True, verify=False, timeout=1000000) as response:
            # total file size as reported by the server
            file_size = int(response.headers.get("Content-Length", 0))
            print(file_size)
            # size in MB for reporting, computed once rather than per chunk
            if file_size > 1000000:
                filesizemb = file_size / 1000000
            else:
                filesizemb = 1

            filename = dataloc
            with open(filename, "wb") as text_file:
                for chunk in response.iter_content(chunk_size=1024):
                    text_file.write(chunk)
            # the with-block already closed the file; no explicit close() needed
            print("closing text file")
    except Exception as e:
        print(e)

Managed to solve it by simply accepting that the connection will die, and writing a new function that resumes the download at the exact byte offset where it failed. The theory behind this is explained in this question - How to resume file download in Python?
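In outline, the resume technique is just an HTTP Range request in a retry loop. Here is a minimal sketch of that idea, separate from the full code below (resume_download is a hypothetical name; it assumes the server honours Range requests and that Tor's SOCKS proxy is listening on localhost:9050):

    import os
    import requests

    def resume_download(url, path):
        session = requests.session()
        session.proxies = {'http': 'socks5h://localhost:9050',
                           'https': 'socks5h://localhost:9050'}
        # disable compression so Content-Length matches the bytes on disk
        headers = {'Accept-Encoding': None}
        with session.get(url, stream=True, headers=headers, timeout=600) as r:
            total = int(r.headers['Content-Length'])
        while not os.path.exists(path) or os.path.getsize(path) < total:
            offset = os.path.getsize(path) if os.path.exists(path) else 0
            # ask the server to start again from where the last attempt died
            headers['Range'] = 'bytes=%d-' % offset
            try:
                with session.get(url, stream=True, headers=headers,
                                 timeout=600) as r:
                    with open(path, 'ab') as f:
                        for chunk in r.iter_content(chunk_size=1024 * 1024):
                            if chunk:
                                f.write(chunk)
            except requests.RequestException:
                continue  # connection died again; loop and resume from the new offset

The full code below does the same thing, split across two functions and with more logging.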

My code (warning, messy):

import logging
import os
from pathlib import Path

import requests
# Timeout is assumed to come from an external watchdog helper used here;
# it is not part of requests


def onionrequestthreadeddataleakdownloadresume(onionurl, resume_byte_pos):
    # onionurl is a [companyname, url] pair; resume_byte_pos is the byte
    # offset at which the previous attempt died
    print("rerunning")
    companyname = onionurl[0]
    onionurl = onionurl[1]
    foldername = '/media/archangel/Elements/clop/dataleaks/'
    dataloc = foldername + companyname + "/"
    try:
        if not os.path.isdir(dataloc):
            os.mkdir(dataloc)
    except Exception as e:
        print(e)
        print("folder not created")

    filename = os.path.basename(onionurl)
    dataloc = dataloc + filename

    try:
        session = requests.session()
        session.proxies = {}
        session.proxies['http'] = 'socks5h://localhost:9050'
        session.proxies['https'] = 'socks5h://localhost:9050'

        print("dataloc")
        print(dataloc)
        print("onionurl")
        print(onionurl)
        url = onionurl

        try:
            if not os.path.isdir(foldername):
                os.makedirs(foldername)
            # chunked download per https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests?rq=1
            try:
                # watchdog so a stalled connection is eventually killed
                try:
                    timeout = Timeout(20)
                    timeout.start()
                except Exception as ex:
                    print(ex)

                # Range resumes from the given offset; Accept-Encoding: None
                # disables compression so Content-Length matches bytes on disk
                resume_header = {'Accept-Encoding': None,
                                 'Range': 'bytes=%d-' % resume_byte_pos}
                try:
                    with session.get(url, stream=True, verify=False,
                                     headers=resume_header, timeout=600) as response:
                        # size of the remaining range, as reported by the server
                        file_size = int(response.headers['Content-Length'])
                        print(file_size)

                        try:
                            # append to the partial file rather than overwriting it
                            with open(dataloc, "ab") as text_file:
                                for chunk in response.iter_content(chunk_size=1024 * 1024):
                                    if chunk:
                                        text_file.write(chunk)
                                        text_file.flush()
                        except Exception as ex:
                            logging.error(f'write failed with error: {ex}')
                            print(ex)

                        print("exited with for file")
                except Exception as ex:
                    logging.error(f'Request failed with error: {ex}')
                    print(ex)
            except Exception as ex:
                logging.error(f'Attempt failed with error: {ex}')
                print(ex)

            print("closing text file")
        except Exception as e:
            print("FAILED DOWNLOAD 2")
            print(e)
    except Exception as e:
        print("FAILED DOWNLOAD 5")
        print(e)

def onionrequestthreadeddataleakdownload2(onionurl):
    # onionurl is a [companyname, url] pair
    companyname = onionurl[0]
    onionurl = onionurl[1]
    foldername = '/media/archangel/Elements/clop/dataleaks/'
    dataloc = foldername + companyname + "/"
    try:
        if not os.path.isdir(dataloc):
            os.mkdir(dataloc)
    except Exception as e:
        print(e)
        print("folder not created")

    filenamebasename = os.path.basename(onionurl)
    dataloc = dataloc + filenamebasename

    try:
        session = requests.session()
        session.proxies = {}
        session.proxies['http'] = 'socks5h://localhost:9050'
        session.proxies['https'] = 'socks5h://localhost:9050'

        print("dataloc")
        print(dataloc)
        print("onionurl")
        print(onionurl)
        url = onionurl

        try:
            if not os.path.isdir(foldername):
                os.makedirs(foldername)
            # chunked download per https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests?rq=1
            try:
                # watchdog so a stalled connection is eventually killed
                try:
                    timeout = Timeout(20)
                    timeout.start()
                except Exception as ex:
                    print(ex)

                # disable compression so Content-Length matches bytes on disk
                headersac = {'Accept-Encoding': None}
                try:
                    with session.get(url, stream=True, verify=False,
                                     headers=headersac, timeout=600) as response:
                        # total file size as reported by the server
                        file_size = int(response.headers['Content-Length'])
                        if file_size > 1000000:
                            filesizemb = file_size / 1000000
                        else:
                            filesizemb = 1
                        print(file_size)

                        try:
                            with open(dataloc, "wb") as text_file:
                                for chunk in response.iter_content(chunk_size=1024 * 1024):
                                    if chunk:
                                        text_file.write(chunk)
                                        text_file.flush()
                        except Exception as ex:
                            logging.error(f'write failed with error: {ex}')
                            print(ex)
                except Exception as ex:
                    logging.error(f'request failed with error: {ex}')
                    print(ex)
                    print("exited with for file")

                # keep resuming from the current offset until the file on
                # disk matches the size the server reported
                file_size_offline = Path(dataloc).stat().st_size
                while file_size_offline != file_size:
                    try:
                        print(file_size_offline)
                        print(file_size)
                        print("file size incomplete")
                        onionrequestthreadeddataleakdownloadresume(
                            [companyname, onionurl], file_size_offline)
                        file_size_offline = Path(dataloc).stat().st_size
                    except Exception as ex:
                        print("redownload failed")
                        print(ex)
                print("LOOP FINISHED")
                print(file_size)
                print(file_size_offline)
                print(dataloc)
            except Exception as ex:
                logging.error(f'Attempt failed with error: {ex}')
                print(ex)

            # [location on disk, filename, onion url, size in MB]
            return [dataloc, filenamebasename, url, filesizemb]
        except Exception as e:
            print("FAILED DOWNLOAD 2")
            print(e)
    except Exception as e:
        print("FAILED DOWNLOAD 5")
        print(e)
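For completeness, these functions are driven from a multiprocessing pool. The question only states that the download is multiprocessed, so the driver below is a hypothetical sketch (the company name and onion URL are made-up placeholders):

    from multiprocessing import Pool

    if __name__ == '__main__':
        # each job is a [companyname, url] pair, the shape both functions expect
        jobs = [
            ['examplecompany', 'http://exampleonionservice.onion/leak.7z'],
        ]
        with Pool(processes=4) as pool:
            results = pool.map(onionrequestthreadeddataleakdownload2, jobs)
        print(results)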
