[英]Python Thread Pool Faster than Go Routines when Scanning AWS S3?
I have recently been digging into understanding Golang concurrency, in particular the use of channels and worker pools.我最近一直在深入了解 Golang 并发,特别是通道和工作池的使用。 I wanted to compare performance between Go and Python (as many have done) because I have mostly read that Go outperforms Python with regard to concurrency. So I wrote two programs to scan an AWS account's S3 buckets and report back the total size.因此,我编写了两个程序来扫描 AWS 账户的 S3 存储桶并报告总大小。 I performed this on an account that had more than 75 buckets totaling more than a few TB of data.我在一个帐户上执行了此操作,该帐户有超过 75 个存储桶,总计超过几 TB 的数据。
I was surprised to find that my Python implementation was nearly 2x faster than my Go implementation.我惊讶地发现我的 Python 实现比我的 Go 实现快了近 2 倍。 This confuses me based on all the benchmarks and literature I have read.基于我读过的所有基准和文献,这让我感到困惑。 This leads me to believe that I did not implement my Go code correctly.这让我相信我没有正确实现我的 Go 代码。 While watching both programs run I noticed that the Go implementation only used up to 15% of my CPU while Python used >85%.在观看这两个程序运行时,我注意到 Go 实现仅使用了我的 CPU 的 15%,而 Python 使用了 >85%。 Am I missing an important step with Go or am I missing something in my implementation?我是否错过了 Go 的重要步骤,或者我在实施过程中遗漏了什么? Thanks in advance!提前致谢!
Python Code: Python 代码:
'''
Get the size of all objects in all buckets in S3
'''
import os
import sys
import boto3
import concurrent.futures
def get_s3_bucket_sizes(aws_access_key_id, aws_secret_access_key, aws_session_token=None):
    """Print the total size in bytes of all objects in all S3 buckets.

    The credential parameters are kept for interface compatibility but are
    not passed to boto3 directly; ``boto3.client('s3')`` reads credentials
    from the environment/credential chain on its own.
    """
    s3client = boto3.client('s3')
    # Object sizes are integral byte counts — accumulate as int (the
    # original used 0.0, which prints a trailing ".0" and loses precision
    # above 2**53 bytes).
    total_size = 0
    #
    # Start gathering data...
    #
    # Get all of the buckets in the account, then fan the per-bucket scans
    # out across a thread pool; list_objects is network-bound, so threads
    # overlap the latency despite the GIL.
    _buckets = s3client.list_buckets()
    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
        future_bucket_to_scan = {
            executor.submit(get_bucket_objects, s3client, bucket): bucket
            for bucket in _buckets["Buckets"]
        }
        for future in concurrent.futures.as_completed(future_bucket_to_scan):
            try:
                ret = future.result()
            except Exception as exc:
                # Report the failing bucket but keep scanning the rest.
                print('ERROR: %s' % (str(exc)))
            else:
                total_size += ret
    print(total_size)
def get_bucket_objects(s3client, bucket):
    """Return the total size in bytes of every object in ``bucket``.

    ``bucket`` is one entry from the ``list_buckets()`` response (a dict
    with a ``Name`` key).  ``list_objects`` returns at most 1000 keys per
    call, so we must follow ``IsTruncated``/``Marker`` to page through the
    whole listing — the original loop broke after the first page and
    undercounted any bucket with more than 1000 objects.
    """
    name = bucket["Name"]
    size = 0
    kwargs = {"Bucket": name}
    while True:
        lsbuckets = s3client.list_objects(**kwargs)
        contents = lsbuckets.get("Contents", [])
        for content in contents:
            size += content["Size"]
        # Keep paging until S3 reports the listing is complete.
        if not lsbuckets.get("IsTruncated"):
            break
        # NextMarker is only returned when a Delimiter was supplied; per
        # the ListObjects docs, fall back to the last key of this page.
        marker = lsbuckets.get("NextMarker") or (contents[-1]["Key"] if contents else None)
        if not marker:
            break
        kwargs["Marker"] = marker
    return size
#
# Main
#
if __name__ == '__main__':
    # Entry point: credentials come from the environment (boto3 also reads
    # them itself — see get_s3_bucket_sizes).
    get_s3_bucket_sizes(
        os.environ.get("AWS_ACCESS_KEY_ID"),
        os.environ.get("AWS_SECRET_ACCESS_KEY"),
    )
Go Code: Go 代码:
package main
import (
"fmt"
"sync"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/awserr"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/s3"
)
// S3_Bucket_Response carries the result of scanning one bucket back to main.
type S3_Bucket_Response struct {
	bucket string // name of the bucket this result refers to
	count  int64  // number of objects counted in the bucket
	size   int64  // total size in bytes of the counted objects
	err    error  // non-nil if listing the bucket's objects failed
}
// S3_Bucket_Request is one unit of work for a scanning goroutine: which
// bucket to list, and the region to open the S3 session in.
type S3_Bucket_Request struct {
	bucket string // bucket name to scan
	region string // AWS region the bucket lives in (from GetBucketLocation)
}
// get_bucket_objects_async is a worker goroutine body. It consumes scan
// requests until the requests channel is closed, lists every object in each
// requested bucket, and emits exactly one S3_Bucket_Response per request.
// Fixes over the original: wg.Done is deferred (so a panic can't hang
// main's Wait), the session.NewSession error is no longer discarded, an
// unrecognized awserr code no longer produces a silent zero-size success,
// and the listing pages through ContinuationToken — ListObjectsV2 returns
// at most 1000 keys per call, so a single call undercounts large buckets.
func get_bucket_objects_async(wg *sync.WaitGroup, requests chan S3_Bucket_Request, responses chan S3_Bucket_Response) {
	defer wg.Done()

	for request := range requests {
		response := S3_Bucket_Response{bucket: request.bucket}

		sess, err := session.NewSession(&aws.Config{
			Region: aws.String(request.region),
		})
		if err != nil {
			response.err = fmt.Errorf("creating session for bucket (%s): %s", request.bucket, err)
			responses <- response
			continue
		}
		s3conn := s3.New(sess)

		// Page through the full object listing.
		input := &s3.ListObjectsV2Input{
			Bucket: aws.String(request.bucket),
		}
		for {
			resp, err := s3conn.ListObjectsV2(input)
			if err != nil {
				if awsErr, ok := err.(awserr.Error); ok {
					switch awsErr.Code() {
					case "NoSuchBucket":
						response.err = fmt.Errorf("Bucket: (%s) is NoSuchBucket. Must be in process of deleting.", request.bucket)
					case "AccessDenied":
						response.err = fmt.Errorf("Bucket: (%s) is AccessDenied. You should really be running this with full Admin Privaleges", request.bucket)
					default:
						// Original code left err nil here, silently
						// reporting the bucket as empty.
						response.err = fmt.Errorf("Listing Objects Unhandled Error: %s ", err)
					}
				} else {
					response.err = fmt.Errorf("Listing Objects Unhandled Error: %s ", err)
				}
				break
			}
			for _, obj := range resp.Contents {
				response.size += *obj.Size
				response.count++
			}
			if resp.IsTruncated == nil || !*resp.IsTruncated {
				break
			}
			input.ContinuationToken = resp.NextContinuationToken
		}

		responses <- response
	}
}
func main() {
var err error
var size int64
var resp *s3.ListBucketsOutput
var wg sync.WaitGroup
sess, _ := session.NewSession()
s3conn := s3.New(sess)
// Get account bucket listing
if resp, err = s3conn.ListBuckets(&s3.ListBucketsInput{});err != nil {
fmt.Println("Error listing buckets: %s", err)
return
}
buckets := resp.Buckets
size = 0
// Create the buffered channels
requests := make(chan S3_Bucket_Request , len(buckets))
responses := make(chan S3_Bucket_Response, len(buckets))
for i := range buckets {
bucket := *buckets[i].Name
resp2, err := s3conn.GetBucketLocation(&s3.GetBucketLocationInput{
Bucket: aws.String(bucket),
})
if err != nil {
fmt.Printf("Could not get bucket location for bucket (%s): %s", bucket, err)
continue
}
wg.Add(1)
go get_bucket_objects_async(&wg, requests, responses)
region := "us-east-1"
if resp2.LocationConstraint != nil {
region = *resp2.LocationConstraint
}
request := new(S3_Bucket_Request)
request.bucket = bucket
request.region = region
requests <- *request
}
// Close requests channel and wait for responses
close(requests)
wg.Wait()
close(responses)
cnt := 1
// Process the results as they come in
for response := range responses {
fmt.Printf("Bucket: (%s) complete! Buckets remaining: %d\n", response.bucket, len(buckets)-cnt)
// Did the bucket request have errors?
if response.err != nil {
fmt.Println(response.err)
continue
}
cnt += 1
size += response.size
}
fmt.Println(size)
return
}
Sorry, I haven't had an opportunity to fully review this, but my answer would be: the solutions don't appear to be equivalent in terms of concurrency.抱歉,没有机会对此进行全面审查,但我的回答是:解决方案在并发方面似乎并不相同。 Three things pop out: 3件事弹出:
I'm not very familiar with boto, but it looks like Go performs an extra I/O call on the main thread per bucket (GetBucketLocation) when compared to Python.我对 boto 不是很熟悉,但看起来 Go 与 Python 相比,在每个存储桶的主线程(GetBucketLocation)上执行了额外的 IO 调用。 My next questions would be:我的下一个问题是:
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.